Hi,
I want to compare the performance of the CUDA backend versus the VexCL backend in an MPI context.
To do so, I am trying to convert the tutorial code to CUDA. I dropped the single-precision option, so this should be achievable. But I cannot figure out how to write the equivalent of:
vex::Context ctx(vex::Filter::Exclusive(vex::Filter::Count(1)));
for(int i = 0; i < world.size; ++i) {
// unclutter the output:
if (i == world.rank)
std::cout << world.rank << ": " << ctx.queue(0) << std::endl;
MPI_Barrier(world);
}
and
// Copy the RHS vector to the backend:
vex::vector<double> f(ctx, rhs);
As a result, my code does not use several GPUs, even with mpirun -np 2.
Is there such an option?
Thanks in advance,
Rémi
PS: Here is my code at the moment
#include <vector>
#include <iostream>
#include <amgcl/adapter/crs_tuple.hpp>
#include <amgcl/backend/cuda.hpp>
#include <amgcl/mpi/distributed_matrix.hpp>
#include <amgcl/mpi/make_solver.hpp>
#include <amgcl/mpi/amg.hpp>
#include <amgcl/mpi/coarsening/smoothed_aggregation.hpp>
#include <amgcl/mpi/relaxation/spai0.hpp>
#include <amgcl/relaxation/cusparse_ilu0.hpp>
#include <amgcl/mpi/solver/cg.hpp>
#include <amgcl/io/binary.hpp>
#include <amgcl/profiler.hpp>
#if defined(AMGCL_HAVE_PARMETIS)
#  include <amgcl/mpi/partition/parmetis.hpp>
#elif defined(AMGCL_HAVE_SCOTCH)
#  include <amgcl/mpi/partition/ptscotch.hpp>
#endif
//---------------------------------------------------------------------------
int main(int argc, char *argv[]) {
// The matrix and the RHS file names should be in the command line options:
if (argc < 3) {
std::cerr << "Usage: " << argv[0] << " <matrix.bin> <rhs.bin>" << std::endl;
return 1;
}
amgcl::mpi::init mpi(&argc, &argv);
amgcl::mpi::communicator world(MPI_COMM_WORLD);
// Show the name of the GPU we are using:
int device;
cudaDeviceProp prop;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
std::cout << "GPU Device " << device << ": " << prop.name << std::endl;
// The profiler:
amgcl::profiler<> prof("matrix CUDA+MPI");
// Read the system matrix and the RHS:
prof.tic("read");
// Get the global size of the matrix:
ptrdiff_t rows = amgcl::io::crs_size<ptrdiff_t>(argv[1]);
ptrdiff_t cols;
// Split the matrix into approximately equal chunks of rows:
ptrdiff_t chunk = (rows + world.size - 1) / world.size;
ptrdiff_t row_beg = std::min(rows, chunk * world.rank);
ptrdiff_t row_end = std::min(rows, row_beg + chunk);
chunk = row_end - row_beg;
// Read our part of the system matrix and the RHS.
std::vector<ptrdiff_t> ptr, col;
std::vector<double> val, rhs;
amgcl::io::read_crs(argv[1], rows, ptr, col, val, row_beg, row_end);
amgcl::io::read_dense(argv[2], rows, cols, rhs, row_beg, row_end);
prof.toc("read");
if (world.rank == 0)
std::cout
<< "World size: " << world.size << std::endl
<< "Matrix " << argv[1] << ": " << rows << "x" << rows << std::endl
<< "RHS " << argv[2] << ": " << rows << "x" << cols << std::endl;
// Compose the solver type:
//typedef amgcl::backend::builtin<double> Backend;
typedef amgcl::backend::cuda<double> Backend;
// We need to initialize the CUSPARSE library and pass the handle to AMGCL
// in backend parameters:
Backend::params bprm;
cusparseCreate(&bprm.cusparse_handle);
typedef amgcl::mpi::make_solver<
amgcl::mpi::amg<
Backend,
amgcl::mpi::coarsening::smoothed_aggregation<Backend>,
amgcl::mpi::relaxation::spai0<Backend>
>,
amgcl::mpi::solver::cg<Backend>
> Solver;
// Create the distributed matrix from the local parts.
auto A = std::make_shared<amgcl::mpi::distributed_matrix<Backend>>(
world, std::tie(chunk, ptr, col, val));
// Partition the matrix and the RHS vector.
// If neither ParMETIS nor PT-SCOTCH is available,
// just keep the current naive partitioning.
#if defined(AMGCL_HAVE_PARMETIS) || defined(AMGCL_HAVE_SCOTCH)
# if defined(AMGCL_HAVE_PARMETIS)
typedef amgcl::mpi::partition::parmetis<Backend> Partition;
# elif defined(AMGCL_HAVE_SCOTCH)
typedef amgcl::mpi::partition::ptscotch<Backend> Partition;
# endif
if (world.size > 1) {
prof.tic("partition");
Partition part;
// part(A) returns the distributed permutation matrix:
auto P = part(*A);
auto R = transpose(*P);
// Reorder the matrix:
A = product(*R, *product(*A, *P));
// and the RHS vector:
std::vector<double> new_rhs(R->loc_rows());
R->move_to_backend(typename Backend::params());
amgcl::backend::spmv(1, *R, rhs, 0, new_rhs);
rhs.swap(new_rhs);
// Update the number of the local rows
// (it may have changed as a result of permutation):
chunk = A->loc_rows();
prof.toc("partition");
}
#endif
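// Set the solver and preconditioner parameters: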
Solver::params prm;
prm.solver.tol = 5e-4;
prm.solver.maxiter = 100;
prm.precond.npre = 1;
prm.precond.npost = 1;
prm.precond.max_levels = 100;
prm.precond.coarse_enough = 2;
prm.precond.direct_coarse = true;
prm.precond.ncycle = 1;
// Initialize the solver:
prof.tic("setup");
Solver solve(world, A, prm, bprm);
prof.toc("setup");
// Show the mini-report on the constructed solver:
if (world.rank == 0)
std::cout << solve << std::endl;
// Solve the system with the zero initial approximation:
int iters;
double error;
thrust::device_vector<double> f(rhs);
thrust::device_vector<double> x(chunk, 0.0);
prof.tic("solve");
std::tie(iters, error) = solve(*A, f, x);
prof.toc("solve");
// Output the number of iterations, the relative error,
// and the profiling data:
if (world.rank == 0)
std::cout
<< "Iters: " << iters << std::endl
<< "Error: " << error << std::endl
<< prof << std::endl;
}
There is an example, examples/mpi/mpi_solver.cpp, that supports the CUDA backend with MPI. To build it with the CUDA backend, you need to copy the code to a .cu file and define the SOLVER_BACKEND_CUDA preprocessor variable.
In the example, thrust::device_vector<T> is used instead of vex::vector<T> for GPU vectors.
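For instance, the "copy the RHS vector to the backend" step from the VexCL tutorial maps to Thrust like this (a minimal sketch; rhs and chunk are the local host-side data from your code above):
#include <thrust/device_vector.h>
// VexCL version:
//   vex::vector<double> f(ctx, rhs);
// CUDA backend: constructing a thrust::device_vector from a host std::vector
// copies the data to the currently selected GPU.
thrust::device_vector<double> f(rhs);
thrust::device_vector<double> x(chunk, 0.0); // zero initial approximation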
There is no vex::Context equivalent for CUDA that I know of, so you probably have to use the MPI rank explicitly to select the correct device for each of the MPI processes, or use the CUDA_VISIBLE_DEVICES environment variable.
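A minimal sketch of that rank-based selection (select_device_by_rank is just an illustrative helper name; it assumes the GPUs visible on a node are picked round-robin by rank, and on multi-node runs you would normally derive a node-local rank first, e.g. via MPI_Comm_split_type):
#include <mpi.h>
#include <cuda_runtime.h>

// Pick a CUDA device for this process before any other CUDA/cuSPARSE call.
void select_device_by_rank(MPI_Comm comm) {
    int rank = 0;
    MPI_Comm_rank(comm, &rank);

    int ndev = 0;
    cudaGetDeviceCount(&ndev);

    if (ndev > 0) cudaSetDevice(rank % ndev);
}
In your code this would be called right after constructing world, before cudaGetDevice() and cusparseCreate(). Alternatively, launching each rank with a different CUDA_VISIBLE_DEVICES value achieves the same effect without touching the code.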