diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h index 4b6377fa2b..4231c4f455 100644 --- a/src/gpuarray/ext_cuda.h +++ b/src/gpuarray/ext_cuda.h @@ -2,6 +2,7 @@ #define LIBGPU_EXT_CUDA #include +#include #include #include diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 071ec9700d..cd362f985c 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -2,6 +2,7 @@ #include "private.h" #include "private_cuda.h" + #include "loaders/libnvrtc.h" #include "loaders/libcublas.h" @@ -465,6 +466,24 @@ size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; } } static const char CUDA_PREAMBLE[] = + "typedef struct { unsigned short x; } __half;\n" + "#define __HALF_TO_US(var) *(reinterpret_cast(&(var)))\n" + "#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var)))\n" + "#if __CUDACC_VER_MAJOR__ >= 9\n" + "#define __CUDA_FP16_DECL__ static __device__ __forceinline__\n" + "__CUDA_FP16_DECL__ __half __float2half_rn(const float f) \n" + "{\n" + " __half val;\n" + " asm(\"{ cvt.rn.f16.f32 %0, %1;}\\n\" : \"=h\"(__HALF_TO_US(val)) : \"f\"(f));\n" + " return val;\n" + "}\n" + "__CUDA_FP16_DECL__ float __half2float(const __half h)\n" + "{\n" + " float val;\n" + " asm(\"{ cvt.f32.f16 %0, %1;}\\n\" : \"=f\"(val) : \"h\"(__HALF_TO_CUS(h)));\n" + " return val;\n" + "}\n " + "#endif\n" "#define local_barrier() __syncthreads()\n" "#define WITHIN_KERNEL extern \"C\" __device__\n" "#define KERNEL extern \"C\" __global__\n" @@ -502,11 +521,17 @@ static const char CUDA_PREAMBLE[] = "#define ga_ulong unsigned long long\n" "#define ga_float float\n" "#define ga_double double\n" - "#define ga_half ga_ushort\n" + "#define ga_half __half\n" "#define ga_size size_t\n" "#define ga_ssize ptrdiff_t\n" - "#define load_half(p) __half2float(*(p))\n" - "#define store_half(p, v) (*(p) = __float2half_rn(v))\n" + "#define load_half(p) __half2float(*((__half*)p))\n" + "#if __CUDACC_VER_MAJOR__ >= 9\n" + "# define store_half(p, v) (*((__half*)p) = __float2half_rn(v))\n" + "# define __float2half_as_us(v) __float2half_rn(v).x\n" + "#else\n" + "# define store_half(p, v) (*((unsigned short*)p) = __float2half_rn(v))\n" + "# define __float2half_as_us __float2half_rn\n" + "#endif\n" "#define GA_DECL_SHARED_PARAM(type, name)\n" "#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n" "#define GA_WARP_SIZE warpSize\n" @@ -1098,9 +1123,11 @@ static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) { static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; size_t buflen; - const char *opts[4] = { + const char *opts[] = { "-arch", "" +#ifdef DEBUG , "-G", "-lineinfo" +#endif }; nvrtcResult err; @@ -1111,11 +1138,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); err = nvrtcCompileProgram(prog, -#ifdef DEBUG - 4, -#else - 2, -#endif + sizeof(opts)/sizeof(char*), opts); /* Get the log before handling the error */ diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index 99cd5f7e38..de80b715b6 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) { * \ref * ncclRedOp_t. * - * If invalid, return `nccl_NUM_OPS`. + * If invalid, return `ncclNumOps`. */ static inline ncclRedOp_t convert_reduce_op(int opcode) { switch (opcode) { @@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) { case GA_MAX: return ncclMax; case GA_MIN: return ncclMin; } - return nccl_NUM_OPS; + return ncclNumOps; } /** * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref * ncclDataType_t. * - * If invalid, return `nccl_NUM_TYPES`. + * If invalid, return `ncclNumTypes`. */ static inline ncclDataType_t convert_data_type(int typecode) { switch (typecode) { @@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) { case GA_ULONG: return ncclUint64; case GA_HALF: return ncclHalf; } - return nccl_NUM_TYPES; + return ncclNumTypes; } /** @@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc, // typecode must correspond to a valid ncclDataType_t if (datatype != NULL) { *datatype = convert_data_type(typecode); - if (*datatype == nccl_NUM_TYPES) + if (*datatype == ncclNumTypes) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type"); } // opcode must correspond to a valid ncclRedOp_t if (op != NULL) { *op = convert_reduce_op(opcode); - if (*op == nccl_NUM_OPS) + if (*op == ncclNumOps) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op"); } // offsets must not be larger than gpudata's size itself @@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; gpudata *dst = NULL; int rank = 0; cuda_context *ctx; @@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; cuda_context *ctx; ASSERT_BUF(src); @@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; @@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclDataType_t datatype = ncclNumTypes; int rank = 0; cuda_context *ctx; @@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; @@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR( - ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype, - (void *)(dest->ptr + offdest), comm->c, ctx->s)); + ctx, ncclAllGather((void *)(src->ptr + offsrc), + (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 5411949b20..36e009df05 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -208,10 +208,10 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, } for (j = 0; j < n; j++) { if (is_array(args[j])) { - strb_appendf(&sb, "%s %s;", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ? + strb_appendf(&sb, "%s %s;", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ? GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) { - if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { + if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) { strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n", args[j].name, args[j].name, args[j].name); } else { @@ -225,7 +225,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, strb_appends(&sb, ";\n"); for (j = 0; j < n; j++) { if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) { - if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { + if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) { strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n", args[j].name, args[j].name, args[j].name); } else { @@ -502,10 +502,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, strb_appends(&sb, "for (i = idx; i < n; i += numThreads) {\n"); for (j = 0; j < n; j++) { if (is_array(args[j])) { - strb_appendf(&sb, "%s %s;\n", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ? + strb_appendf(&sb, "%s %s;\n", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ? GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) { - if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { + if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16) */) { strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name); @@ -519,7 +519,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, for (j = 0; j < n; j++) { if (is_array(args[j])) { if (ISSET(args[j].flags, GE_WRITE)) { - if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { + if (args[j].typecode == GA_HALF/* && ISSET(gen_flags, GEN_CONVERT_F16) */) { strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name); diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn index 64de5dd88e..f60fc38c65 100644 --- a/src/loaders/libnccl.fn +++ b/src/loaders/libnccl.fn @@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm)); DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count)); DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank)); DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result)); -DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, -cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file +DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream )); +DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream )); +DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h index 0139878c8f..7d70138e03 100644 --- a/src/loaders/libnccl.h +++ b/src/loaders/libnccl.h @@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; typedef enum { ncclSuccess = 0 } ncclResult_t; +/* Reduction operation selector */ typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, - nccl_NUM_OPS = 4 } ncclRedOp_t; - + ncclNumOps = 4 } ncclRedOp_t; /* Data types */ -typedef enum { ncclChar = 0, - ncclInt = 1, - ncclHalf = 2, - ncclFloat = 3, - ncclDouble = 4, - ncclInt64 = 5, - ncclUint64 = 6, - nccl_NUM_TYPES = 7 } ncclDataType_t; +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclNumTypes = 9 } ncclDataType_t; /** @endcond */ diff --git a/src/util/error.c b/src/util/error.c index 19ce184363..24fb336e7f 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -31,12 +31,11 @@ int error_set(error *e, int code, const char *msg) { int error_fmt(error *e, int code, const char *fmt, ...) { va_list ap; - e->code = code; +#ifdef DEBUG va_start(ap, fmt); vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); va_end(ap); -#ifdef DEBUG fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); #endif return code;