Adapt the poisson3DbMPI tutorial to cuda backend #286

Open
rbourgeois33 opened this issue Mar 19, 2025 · 1 comment

rbourgeois33 commented Mar 19, 2025

Hi,

I want to compare the performance of the CUDA backend against the VexCL backend in an MPI context.

To do so, I am trying to convert the tutorial code to CUDA. I removed the single-precision option, so the port should be achievable. However, I cannot figure out how to write the equivalent of:

    vex::Context ctx(vex::Filter::Exclusive(vex::Filter::Count(1)));
    for(int i = 0; i < world.size; ++i) {
        // unclutter the output:
        if (i == world.rank)
            std::cout << world.rank << ": " << ctx.queue(0) << std::endl;
        MPI_Barrier(world);
    }

and

// Copy the RHS vector to the backend:
    vex::vector<double> f(ctx, rhs);

As a result, my code does not use several GPUs, even with mpirun -np 2.

Is there such an option?

Thanks in advance,
Rémi

PS: Here is my code at the moment:

#include <vector>
#include <iostream>

#include <amgcl/adapter/crs_tuple.hpp>

#include <amgcl/backend/cuda.hpp>
#include <amgcl/mpi/distributed_matrix.hpp>
#include <amgcl/mpi/make_solver.hpp>
#include <amgcl/mpi/amg.hpp>
#include <amgcl/mpi/coarsening/smoothed_aggregation.hpp>
#include <amgcl/mpi/relaxation/spai0.hpp>
#include <amgcl/relaxation/cusparse_ilu0.hpp>
#include <amgcl/mpi/solver/cg.hpp>

#include <amgcl/io/binary.hpp>
#include <amgcl/profiler.hpp>

#if defined(AMGCL_HAVE_PARMETIS)
#  include <amgcl/mpi/partition/parmetis.hpp>
#elif defined(AMGCL_HAVE_SCOTCH)
#  include <amgcl/mpi/partition/ptscotch.hpp>
#endif

//---------------------------------------------------------------------------
int main(int argc, char *argv[]) {
    // The matrix and the RHS file names should be in the command line options:
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <matrix.bin> <rhs.bin>" << std::endl;
        return 1;
    }

    amgcl::mpi::init mpi(&argc, &argv);
    amgcl::mpi::communicator world(MPI_COMM_WORLD);

    // Show the name of the GPU we are using:
    int device;
    cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);
    std::cout << "GPU Device " << device << ": " << prop.name << std::endl;

    // The profiler:
    amgcl::profiler<> prof("matrix CUDA+MPI");

    // Read the system matrix and the RHS:
    prof.tic("read");
    // Get the global size of the matrix:
    ptrdiff_t rows = amgcl::io::crs_size<ptrdiff_t>(argv[1]);
    ptrdiff_t cols;

    // Split the matrix into approximately equal chunks of rows
    ptrdiff_t chunk = (rows + world.size - 1) / world.size;
    ptrdiff_t row_beg = std::min(rows, chunk * world.rank);
    ptrdiff_t row_end = std::min(rows, row_beg + chunk);
    chunk = row_end - row_beg;

    // Read our part of the system matrix and the RHS.
    std::vector<ptrdiff_t> ptr, col;
    std::vector<double> val, rhs;
    amgcl::io::read_crs(argv[1], rows, ptr, col, val, row_beg, row_end);
    amgcl::io::read_dense(argv[2], rows, cols, rhs, row_beg, row_end);
    prof.toc("read");

    if (world.rank == 0)
        std::cout
            << "World size: " << world.size << std::endl
            << "Matrix " << argv[1] << ": " << rows << "x" << rows << std::endl
            << "RHS " << argv[2] << ": " << rows << "x" << cols << std::endl;

    // Compose the solver type
    //typedef amgcl::backend::builtin<double> Backend;
    typedef amgcl::backend::cuda<double> Backend;
    // We need to initialize the CUSPARSE library and pass the handle to AMGCL
    // in backend parameters:
    Backend::params bprm;
    cusparseCreate(&bprm.cusparse_handle);

    typedef amgcl::mpi::make_solver<
        amgcl::mpi::amg<
            Backend,
            amgcl::mpi::coarsening::smoothed_aggregation<Backend>,
            amgcl::mpi::relaxation::spai0<Backend>
            >,
        amgcl::mpi::solver::cg<Backend>
        > Solver;

    // Create the distributed matrix from the local parts.
    auto A = std::make_shared<amgcl::mpi::distributed_matrix<Backend>>(
            world, std::tie(chunk, ptr, col, val));

    // Partition the matrix and the RHS vector.
    // If neither ParMETIS nor PT-SCOTCH is available,
    // just keep the current naive partitioning.
#if defined(AMGCL_HAVE_PARMETIS) || defined(AMGCL_HAVE_SCOTCH)
#  if defined(AMGCL_HAVE_PARMETIS)
    typedef amgcl::mpi::partition::parmetis<Backend> Partition;
#  elif defined(AMGCL_HAVE_SCOTCH)
    typedef amgcl::mpi::partition::ptscotch<Backend> Partition;
#  endif

    if (world.size > 1) {
        prof.tic("partition");
        Partition part;

        // part(A) returns the distributed permutation matrix:
        auto P = part(*A);
        auto R = transpose(*P);

        // Reorder the matrix:
        A = product(*R, *product(*A, *P));

        // and the RHS vector:
        std::vector<double> new_rhs(R->loc_rows());
        R->move_to_backend(typename Backend::params());
        amgcl::backend::spmv(1, *R, rhs, 0, new_rhs);
        rhs.swap(new_rhs);

        // Update the number of the local rows
        // (it may have changed as a result of permutation):
        chunk = A->loc_rows();
        prof.toc("partition");
    }
#endif

    Solver::params prm;
    prm.solver.tol = 5e-4;
    prm.solver.maxiter = 100;
    prm.precond.npre = 1;
    prm.precond.npost = 1;
    prm.precond.max_levels = 100;
    prm.precond.coarse_enough = 2;
    prm.precond.direct_coarse = true;
    prm.precond.ncycle = 1;

    // Initialize the solver:
    prof.tic("setup");
    Solver solve(world, A, prm, bprm);
    prof.toc("setup");

    // Show the mini-report on the constructed solver:
    if (world.rank == 0)
        std::cout << solve << std::endl;

    // Solve the system with the zero initial approximation:
    int iters;
    double error;
    thrust::device_vector<double> f(rhs);
    thrust::device_vector<double> x(chunk, 0.0);
   
    prof.tic("solve");
    std::tie(iters, error) = solve(*A, f, x);
    prof.toc("solve");

    // Output the number of iterations, the relative error,
    // and the profiling data:
    if (world.rank == 0)
        std::cout
            << "Iters: " << iters << std::endl
            << "Error: " << error << std::endl
            << prof << std::endl;
}
@ddemidov
Owner

There is an example, examples/mpi/mpi_solver.cpp, that supports the CUDA backend with MPI. To configure it, you need to copy the code into a .cu file and define the SOLVER_BACKEND_CUDA preprocessor variable.

In the example, thrust::device_vector<T> is used instead of vex::vector<T> for GPU vectors.
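
For reference, a minimal sketch of that substitution, reusing the rhs and chunk variables from the code above (an illustration, not a verbatim excerpt from the example):

    // Copy the host-side RHS to the GPU and allocate the solution vector;
    // this replaces vex::vector<double> f(ctx, rhs) from the VexCL tutorial:
    thrust::device_vector<double> f(rhs.begin(), rhs.end());
    thrust::device_vector<double> x(chunk, 0.0);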

There is no vex::Context equivalent for CUDA that I know of, so you probably have to use the MPI rank explicitly to select the correct device for each of the MPI processes, or use the CUDA_VISIBLE_DEVICES environment variable.
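
A rough sketch of the first option (a helper that is not part of amgcl; it assumes one process per GPU on each node and uses MPI_Comm_split_type to obtain the node-local rank):

    #include <mpi.h>
    #include <cuda_runtime.h>

    // Pick a CUDA device based on the node-local MPI rank, so that each
    // process on a node gets its own GPU (round-robin if there are more
    // processes than devices). Call this right after MPI initialization,
    // before creating the cusparse handle.
    void select_device(MPI_Comm comm) {
        MPI_Comm local;
        MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);

        int local_rank;
        MPI_Comm_rank(local, &local_rank);
        MPI_Comm_free(&local);

        int ndev;
        cudaGetDeviceCount(&ndev);
        cudaSetDevice(local_rank % ndev);
    }

With Open MPI, a wrapper script that sets CUDA_VISIBLE_DEVICES from OMPI_COMM_WORLD_LOCAL_RANK before launching the solver achieves the same effect without code changes.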

see https://github.com/copilot/share/42235316-42c0-8404-8902-900164806815 for some insights.
