From 47bb6726025c5badcd03c03549c5ee23603a6aaa Mon Sep 17 00:00:00 2001
From: Boris Fomitchev
Date: Sat, 22 Jul 2017 14:23:01 -0700
Subject: [PATCH 1/3] NCCL 2.0 fix

---
 src/gpuarray_collectives_cuda_nccl.c | 32 ++++++++++++++--------------
 src/loaders/libnccl.fn               | 11 +++++-----
 src/loaders/libnccl.h                | 22 ++++++++++---------
 3 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
index 99cd5f7e38..de80b715b6 100644
--- a/src/gpuarray_collectives_cuda_nccl.c
+++ b/src/gpuarray_collectives_cuda_nccl.c
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
  * \ref
  * ncclRedOp_t.
  *
- * If invalid, return `nccl_NUM_OPS`.
+ * If invalid, return `ncclNumOps`.
  */
 static inline ncclRedOp_t convert_reduce_op(int opcode) {
   switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
   case GA_MAX: return ncclMax;
   case GA_MIN: return ncclMin;
   }
-  return nccl_NUM_OPS;
+  return ncclNumOps;
 }
 
 /**
  * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
  * ncclDataType_t.
  *
- * If invalid, return `nccl_NUM_TYPES`.
+ * If invalid, return `ncclNumTypes`.
  */
 static inline ncclDataType_t convert_data_type(int typecode) {
   switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
   case GA_ULONG: return ncclUint64;
   case GA_HALF: return ncclHalf;
   }
-  return nccl_NUM_TYPES;
+  return ncclNumTypes;
 }
 
 /**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
   // typecode must correspond to a valid ncclDataType_t
   if (datatype != NULL) {
     *datatype = convert_data_type(typecode);
-    if (*datatype == nccl_NUM_TYPES)
+    if (*datatype == ncclNumTypes)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
   }
   // opcode must correspond to a valid ncclRedOp_t
   if (op != NULL) {
     *op = convert_reduce_op(opcode);
-    if (*op == nccl_NUM_OPS)
+    if (*op == ncclNumOps)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
   }
   // offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                   size_t count, int typecode, int opcode, int root,
                   gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   gpudata *dst = NULL;
   int rank = 0;
   cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode, int opcode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   cuda_context *ctx;
 
   ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                           size_t offdest, size_t count, int typecode,
                           int opcode, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
 static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                      int root, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int rank = 0;
   cuda_context *ctx;
 
@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
 
   // change stream of nccl ops to enable concurrency
   NCCL_EXIT_ON_ERROR(
-      ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
-                         (void *)(dest->ptr + offdest), comm->c, ctx->s));
+      ctx, ncclAllGather((void *)(src->ptr + offsrc),
+                         (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index 64de5dd88e..f60fc38c65 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
 DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
 DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
 DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
-DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h
index 0139878c8f..7d70138e03 100644
--- a/src/loaders/libnccl.h
+++ b/src/loaders/libnccl.h
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
 typedef enum { ncclSuccess = 0 } ncclResult_t;
 
+/* Reduction operation selector */
 typedef enum { ncclSum = 0,
                ncclProd = 1,
                ncclMax = 2,
                ncclMin = 3,
-               nccl_NUM_OPS = 4 } ncclRedOp_t;
-
+               ncclNumOps = 4 } ncclRedOp_t;
 
 /* Data types */
-typedef enum { ncclChar = 0,
-               ncclInt = 1,
-               ncclHalf = 2,
-               ncclFloat = 3,
-               ncclDouble = 4,
-               ncclInt64 = 5,
-               ncclUint64 = 6,
-               nccl_NUM_TYPES = 7 } ncclDataType_t;
+typedef enum { ncclInt8 = 0, ncclChar = 0,
+               ncclUint8 = 1,
+               ncclInt32 = 2, ncclInt = 2,
+               ncclUint32 = 3,
+               ncclInt64 = 4,
+               ncclUint64 = 5,
+               ncclFloat16 = 6, ncclHalf = 6,
+               ncclFloat32 = 7, ncclFloat = 7,
+               ncclFloat64 = 8, ncclDouble = 8,
+               ncclNumTypes = 9 } ncclDataType_t;
 
 /** @endcond */

From c1d3e80c6a557d4a59746fcecbf0b5bfbc49e6f1 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 21 Aug 2017 16:06:07 -0400
Subject: [PATCH 2/3] Block loading of nccl 1.0

---
 src/loaders/libnccl.fn | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index f60fc38c65..bb9f3ddefd 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -8,4 +8,6 @@ DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t
 DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
 DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
 DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
+// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading.
+DEF_PROC(ncclResult_t, ncclGroupStart, ());

From 981ff8ef4273b92d740617b2dac3767f06c5cfdd Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Tue, 22 Aug 2017 17:54:56 -0400
Subject: [PATCH 3/3] Better error message for old NCCL.

---
 src/loaders/libnccl.c  | 3 +++
 src/loaders/libnccl.fn | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c
index 4ef247d117..08d5643330 100644
--- a/src/loaders/libnccl.c
+++ b/src/loaders/libnccl.c
@@ -40,6 +40,9 @@ int load_libnccl(error *e) {
 
 #include "libnccl.fn"
 
+  if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL)
+    return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required");
+
   loaded = 1;
   return GA_NO_ERROR;
 }
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index bb9f3ddefd..caf365b849 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -9,5 +9,3 @@ DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, siz
 DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
 DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
 DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
-// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading.
-DEF_PROC(ncclResult_t, ncclGroupStart, ());
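
Note on the version probe used by PATCH 2/3 and 3/3: ncclGroupStart() only exists in NCCL 2.0 and later, so a missing ncclGroupStart symbol is taken as proof that the installed library is NCCL 1.x, and the loader refuses it with a clear error. The snippet below is not part of the patches; it is a minimal standalone sketch of the same idea using plain dlopen()/dlsym(), assuming the library is visible as libnccl.so.2 or libnccl.so (libgpuarray itself resolves symbols through its ga_func_ptr() helper instead).

/* probe_nccl.c (hypothetical file name); build with: cc probe_nccl.c -ldl */
#include <dlfcn.h>
#include <stdio.h>

int main(void) {
  /* Try the versioned soname first, then fall back to the generic name. */
  void *lib = dlopen("libnccl.so.2", RTLD_NOW | RTLD_LOCAL);
  if (lib == NULL)
    lib = dlopen("libnccl.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == NULL) {
    fprintf(stderr, "NCCL not found: %s\n", dlerror());
    return 1;
  }
  /* ncclGroupStart was introduced in NCCL 2.0; its absence means NCCL 1.x. */
  if (dlsym(lib, "ncclGroupStart") == NULL) {
    fprintf(stderr, "Found NCCL 1.0 but NCCL 2.0 required\n");
    dlclose(lib);
    return 1;
  }
  printf("NCCL 2.x detected\n");
  dlclose(lib);
  return 0;
}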