From 47bb6726025c5badcd03c03549c5ee23603a6aaa Mon Sep 17 00:00:00 2001
From: Boris Fomitchev
Date: Sat, 22 Jul 2017 14:23:01 -0700
Subject: [PATCH 1/3] NCCL 2.0 fix

---
 src/gpuarray_collectives_cuda_nccl.c | 32 ++++++++++++++--------------
 src/loaders/libnccl.fn               | 11 +++++-----
 src/loaders/libnccl.h                | 22 ++++++++++---------
 3 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
index 99cd5f7e38..de80b715b6 100644
--- a/src/gpuarray_collectives_cuda_nccl.c
+++ b/src/gpuarray_collectives_cuda_nccl.c
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
  * \ref
  * ncclRedOp_t.
  *
- * If invalid, return `nccl_NUM_OPS`.
+ * If invalid, return `ncclNumOps`.
  */
 static inline ncclRedOp_t convert_reduce_op(int opcode) {
   switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
   case GA_MAX: return ncclMax;
   case GA_MIN: return ncclMin;
   }
-  return nccl_NUM_OPS;
+  return ncclNumOps;
 }
 
 /**
  * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
  * ncclDataType_t.
  *
- * If invalid, return `nccl_NUM_TYPES`.
+ * If invalid, return `ncclNumTypes`.
  */
 static inline ncclDataType_t convert_data_type(int typecode) {
   switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
   case GA_ULONG: return ncclUint64;
   case GA_HALF: return ncclHalf;
   }
-  return nccl_NUM_TYPES;
+  return ncclNumTypes;
 }
 
 /**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
   // typecode must correspond to a valid ncclDataType_t
   if (datatype != NULL) {
     *datatype = convert_data_type(typecode);
-    if (*datatype == nccl_NUM_TYPES)
+    if (*datatype == ncclNumTypes)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
   }
   // opcode must correspond to a valid ncclRedOp_t
   if (op != NULL) {
     *op = convert_reduce_op(opcode);
-    if (*op == nccl_NUM_OPS)
+    if (*op == ncclNumOps)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
   }
   // offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                   size_t count, int typecode, int opcode, int root,
                   gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   gpudata *dst = NULL;
   int rank = 0;
   cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode, int opcode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   cuda_context *ctx;
 
   ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                           size_t offdest, size_t count, int typecode,
                           int opcode, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
 static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                      int root, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int rank = 0;
   cuda_context *ctx;
 
@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
 
   // change stream of nccl ops to enable concurrency
   NCCL_EXIT_ON_ERROR(
-      ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
-                         (void *)(dest->ptr + offdest), comm->c, ctx->s));
+      ctx, ncclAllGather((void *)(src->ptr + offsrc),
+                         (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index 64de5dd88e..f60fc38c65 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
 DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
 DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
 DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
-DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h
index 0139878c8f..7d70138e03 100644
--- a/src/loaders/libnccl.h
+++ b/src/loaders/libnccl.h
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
 typedef enum { ncclSuccess = 0 } ncclResult_t;
 
+/* Reduction operation selector */
 typedef enum { ncclSum = 0,
                ncclProd = 1,
                ncclMax = 2,
                ncclMin = 3,
-               nccl_NUM_OPS = 4 } ncclRedOp_t;
-
+               ncclNumOps = 4 } ncclRedOp_t;
 
 /* Data types */
-typedef enum { ncclChar = 0,
-               ncclInt = 1,
-               ncclHalf = 2,
-               ncclFloat = 3,
-               ncclDouble = 4,
-               ncclInt64 = 5,
-               ncclUint64 = 6,
-               nccl_NUM_TYPES = 7 } ncclDataType_t;
+typedef enum { ncclInt8 = 0, ncclChar = 0,
+               ncclUint8 = 1,
+               ncclInt32 = 2, ncclInt = 2,
+               ncclUint32 = 3,
+               ncclInt64 = 4,
+               ncclUint64 = 5,
+               ncclFloat16 = 6, ncclHalf = 6,
+               ncclFloat32 = 7, ncclFloat = 7,
+               ncclFloat64 = 8, ncclDouble = 8,
+               ncclNumTypes = 9 } ncclDataType_t;
 
 /** @endcond */

From c1d3e80c6a557d4a59746fcecbf0b5bfbc49e6f1 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 21 Aug 2017 16:06:07 -0400
Subject: [PATCH 2/3] Block loading of nccl 1.0

---
 src/loaders/libnccl.fn | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index f60fc38c65..bb9f3ddefd 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -8,4 +8,6 @@ DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t
 DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
 DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
 DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
+// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading.
+DEF_PROC(ncclResult_t, ncclGroupStart, ());

From 981ff8ef4273b92d740617b2dac3767f06c5cfdd Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Tue, 22 Aug 2017 17:54:56 -0400
Subject: [PATCH 3/3] Better error message for old NCCL.

---
 src/loaders/libnccl.c  | 3 +++
 src/loaders/libnccl.fn | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c
index 4ef247d117..08d5643330 100644
--- a/src/loaders/libnccl.c
+++ b/src/loaders/libnccl.c
@@ -40,6 +40,9 @@ int load_libnccl(error *e) {
 
 #include "libnccl.fn"
 
+  if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL)
+    return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required");
+
   loaded = 1;
   return GA_NO_ERROR;
 }
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index bb9f3ddefd..caf365b849 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -9,5 +9,3 @@ DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, siz
 DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
 DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
 DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
-// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading.
-DEF_PROC(ncclResult_t, ncclGroupStart, ());
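
Note on the version probe used by PATCH 2/3 and 3/3: ncclGroupStart() only exists in NCCL 2.0 and later, so a missing ncclGroupStart symbol is taken as proof that the installed library is NCCL 1.x, and the loader refuses it with a clear error. The snippet below is not part of the patches; it is a minimal standalone sketch of the same idea using plain dlopen()/dlsym(), assuming the library is visible as libnccl.so.2 or libnccl.so (libgpuarray itself resolves symbols through its ga_func_ptr() helper instead).

/* probe_nccl.c (hypothetical file name); build with: cc probe_nccl.c -ldl */
#include <dlfcn.h>
#include <stdio.h>

int main(void) {
  /* Try the versioned soname first, then fall back to the generic name. */
  void *lib = dlopen("libnccl.so.2", RTLD_NOW | RTLD_LOCAL);
  if (lib == NULL)
    lib = dlopen("libnccl.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == NULL) {
    fprintf(stderr, "NCCL not found: %s\n", dlerror());
    return 1;
  }
  /* ncclGroupStart was introduced in NCCL 2.0; its absence means NCCL 1.x. */
  if (dlsym(lib, "ncclGroupStart") == NULL) {
    fprintf(stderr, "Found NCCL 1.0 but NCCL 2.0 required\n");
    dlclose(lib);
    return 1;
  }
  printf("NCCL 2.x detected\n");
  dlclose(lib);
  return 0;
}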