diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h
index 4b6377fa2b..4231c4f455 100644
--- a/src/gpuarray/ext_cuda.h
+++ b/src/gpuarray/ext_cuda.h
@@ -2,6 +2,7 @@
 #define LIBGPU_EXT_CUDA
 
 #include <cuda.h>
+#include <cuda_fp16.h>
 
 #include <gpuarray/config.h>
 #include <gpuarray/buffer.h>
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 071ec9700d..cd362f985c 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -2,6 +2,7 @@
 
 #include "private.h"
 #include "private_cuda.h"
+
 #include "loaders/libnvrtc.h"
 #include "loaders/libcublas.h"
 
@@ -465,6 +466,24 @@ size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
   }
 
 static const char CUDA_PREAMBLE[] =
+    "typedef struct { unsigned short x; } __half;\n"
+    "#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))\n"
+    "#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))\n"
+    "#if __CUDACC_VER_MAJOR__ >= 9\n"
+    "#define __CUDA_FP16_DECL__ static __device__ __forceinline__\n"
+    "__CUDA_FP16_DECL__ __half __float2half_rn(const float f) \n"
+    "{\n"
+    "  __half val;\n"
+    "  asm(\"{  cvt.rn.f16.f32 %0, %1;}\\n\" : \"=h\"(__HALF_TO_US(val)) : \"f\"(f));\n"
+    "  return val;\n"
+    "}\n"
+    "__CUDA_FP16_DECL__ float __half2float(const __half h)\n"
+    "{\n"
+    "  float val;\n"
+    "  asm(\"{  cvt.f32.f16 %0, %1;}\\n\" : \"=f\"(val) : \"h\"(__HALF_TO_CUS(h)));\n"
+    "  return val;\n"
+    "}\n "
+    "#endif\n"
     "#define local_barrier() __syncthreads()\n"
     "#define WITHIN_KERNEL extern \"C\" __device__\n"
     "#define KERNEL extern \"C\" __global__\n"
@@ -502,11 +521,17 @@ static const char CUDA_PREAMBLE[] =
     "#define ga_ulong unsigned long long\n"
     "#define ga_float float\n"
     "#define ga_double double\n"
-    "#define ga_half ga_ushort\n"
+    "#define ga_half __half\n"
     "#define ga_size size_t\n"
     "#define ga_ssize ptrdiff_t\n"
-    "#define load_half(p) __half2float(*(p))\n"
-    "#define store_half(p, v) (*(p) = __float2half_rn(v))\n"
+    "#define load_half(p) __half2float(*((__half*)p))\n"
+    "#if __CUDACC_VER_MAJOR__ >= 9\n"
+    "# define store_half(p, v) (*((__half*)p) = __float2half_rn(v))\n"
+    "# define __float2half_as_us(v) __float2half_rn(v).x\n"
+    "#else\n"
+    "# define store_half(p, v) (*((unsigned short*)p) = __float2half_rn(v))\n"
+    "# define __float2half_as_us __float2half_rn\n"
+    "#endif\n"
     "#define GA_DECL_SHARED_PARAM(type, name)\n"
     "#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n"
     "#define GA_WARP_SIZE warpSize\n"
@@ -1098,9 +1123,11 @@ static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) {
 static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   nvrtcProgram prog;
   size_t buflen;
-  const char *opts[4] = {
+  const char *opts[] = {
     "-arch", ""
+#ifdef DEBUG
     , "-G", "-lineinfo"
+#endif
   };
   nvrtcResult err;
 
@@ -1111,11 +1138,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
     return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);
 
   err = nvrtcCompileProgram(prog,
-#ifdef DEBUG
-                            4,
-#else
-                            2,
-#endif
+			    sizeof(opts)/sizeof(char*),
                             opts);
 
   /* Get the log before handling the error */
diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
index 99cd5f7e38..de80b715b6 100644
--- a/src/gpuarray_collectives_cuda_nccl.c
+++ b/src/gpuarray_collectives_cuda_nccl.c
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
  * \ref
  * ncclRedOp_t.
  *
- * If invalid, return `nccl_NUM_OPS`.
+ * If invalid, return `ncclNumOps`.
  */
 static inline ncclRedOp_t convert_reduce_op(int opcode) {
   switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
   case GA_MAX: return ncclMax;
   case GA_MIN: return ncclMin;
   }
-  return nccl_NUM_OPS;
+  return ncclNumOps;
 }
 
 /**
  * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
  * ncclDataType_t.
  *
- * If invalid, return `nccl_NUM_TYPES`.
+ * If invalid, return `ncclNumTypes`.
  */
 static inline ncclDataType_t convert_data_type(int typecode) {
   switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
   case GA_ULONG: return ncclUint64;
   case GA_HALF: return ncclHalf;
   }
-  return nccl_NUM_TYPES;
+  return ncclNumTypes;
 }
 
 /**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
   // typecode must correspond to a valid ncclDataType_t
   if (datatype != NULL) {
     *datatype = convert_data_type(typecode);
-    if (*datatype == nccl_NUM_TYPES)
+    if (*datatype == ncclNumTypes)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
   }
   // opcode must correspond to a valid ncclRedOp_t
   if (op != NULL) {
     *op = convert_reduce_op(opcode);
-    if (*op == nccl_NUM_OPS)
+    if (*op == ncclNumOps)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
   }
   // offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                   size_t count, int typecode, int opcode, int root,
                   gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   gpudata *dst = NULL;
   int rank = 0;
   cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode, int opcode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   cuda_context *ctx;
 
   ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                           size_t offdest, size_t count, int typecode,
                           int opcode, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
 static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                      int root, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int rank = 0;
   cuda_context *ctx;
 
@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
 
   // change stream of nccl ops to enable concurrency
   NCCL_EXIT_ON_ERROR(
-      ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
-                         (void *)(dest->ptr + offdest), comm->c, ctx->s));
+      ctx, ncclAllGather((void *)(src->ptr + offsrc),
+			 (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
index 5411949b20..36e009df05 100644
--- a/src/gpuarray_elemwise.c
+++ b/src/gpuarray_elemwise.c
@@ -208,10 +208,10 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
   }
   for (j = 0; j < n; j++) {
     if (is_array(args[j])) {
-      strb_appendf(&sb, "%s %s;", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
+      strb_appendf(&sb, "%s %s;", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?
                                         GA_FLOAT : args[j].typecode), args[j].name);
       if (ISSET(args[j].flags, GE_READ)) {
-        if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+        if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
           strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n",
                        args[j].name, args[j].name, args[j].name);
         } else {
@@ -225,7 +225,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
   strb_appends(&sb, ";\n");
   for (j = 0; j < n; j++) {
     if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) {
-      if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+      if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
         strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n",
                      args[j].name, args[j].name, args[j].name);
       } else {
@@ -502,10 +502,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
   strb_appends(&sb, "for (i = idx; i < n; i += numThreads) {\n");
   for (j = 0; j < n; j++) {
     if (is_array(args[j])) {
-      strb_appendf(&sb, "%s %s;\n", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
+      strb_appendf(&sb, "%s %s;\n", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?
                                           GA_FLOAT : args[j].typecode), args[j].name);
       if (ISSET(args[j].flags, GE_READ)) {
-        if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+        if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
           strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name);
         } else {
           strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name);
@@ -519,7 +519,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
   for (j = 0; j < n; j++) {
     if (is_array(args[j])) {
       if (ISSET(args[j].flags, GE_WRITE)) {
-        if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+        if (args[j].typecode == GA_HALF/* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
           strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name);
         } else {
           strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name);
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
index 64de5dd88e..f60fc38c65 100644
--- a/src/loaders/libnccl.fn
+++ b/src/loaders/libnccl.fn
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
 DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
 DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
 DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
-DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
\ No newline at end of file
diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h
index 0139878c8f..7d70138e03 100644
--- a/src/loaders/libnccl.h
+++ b/src/loaders/libnccl.h
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
 typedef enum { ncclSuccess = 0 } ncclResult_t;
 
+/* Reduction operation selector */
 typedef enum { ncclSum        = 0,
                ncclProd       = 1,
                ncclMax        = 2,
                ncclMin        = 3,
-               nccl_NUM_OPS   = 4 } ncclRedOp_t;
-
+               ncclNumOps     = 4 } ncclRedOp_t;
 /* Data types */
-typedef enum { ncclChar       = 0,
-               ncclInt        = 1,
-               ncclHalf       = 2,
-               ncclFloat      = 3,
-               ncclDouble     = 4,
-               ncclInt64      = 5,
-               ncclUint64     = 6,
-               nccl_NUM_TYPES = 7 } ncclDataType_t;
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclNumTypes   = 9 } ncclDataType_t;
 
 /** @endcond */
 
diff --git a/src/util/error.c b/src/util/error.c
index 19ce184363..24fb336e7f 100644
--- a/src/util/error.c
+++ b/src/util/error.c
@@ -31,12 +31,11 @@ int error_set(error *e, int code, const char *msg) {
 
 int error_fmt(error *e, int code, const char *fmt, ...) {
   va_list ap;
-
   e->code = code;
+#ifdef DEBUG
   va_start(ap, fmt);
   vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap);
   va_end(ap);
-#ifdef DEBUG
   fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg);
 #endif
   return code;