
Fixed fp16 breakage caused by CUDA9 changes #485

Closed - wants to merge 10 commits
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -4,6 +4,15 @@ PROJECT(libgpuarray C)

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/")

FIND_PACKAGE(CUDA 8.0)

#for Travis
if (NOT CUDA_FOUND)
set(CUDA_VERSION_MAJOR 8)
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCUDA_VERSION_MAJOR=${CUDA_VERSION_MAJOR}")
Member:
I don't want to depend on an installed version of CUDA to compile.

Contributor Author:
Why? Would you ever build Theano on a CUDA 8 system and run it on CUDA 9?

Contributor Author:
Actually, you're right - no need to introduce a build-time dependency here.


# -Wall is unbelieveably noisy with Visual Studio:
# https://stackoverflow.com/q/4001736/3257826
if(MSVC)
1 change: 1 addition & 0 deletions src/gpuarray/ext_cuda.h
@@ -2,6 +2,7 @@
#define LIBGPU_EXT_CUDA

#include <cuda.h>
#include <cuda_fp16.h>
Member:
Why do you need this include here?

Contributor Author:
Well, if we have fp16-related code anywhere, cuda_fp16.h should be thought of as an extension of cuda.h.

Member:
Ok


#include <gpuarray/config.h>
#include <gpuarray/buffer.h>
35 changes: 26 additions & 9 deletions src/gpuarray_buffer_cuda.c
@@ -2,6 +2,7 @@

#include "private.h"
#include "private_cuda.h"

#include "loaders/libnvrtc.h"
#include "loaders/libcublas.h"

@@ -465,6 +466,24 @@ size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
}

static const char CUDA_PREAMBLE[] =
#if CUDA_VERSION_MAJOR >= 9
"#define __CUDA_FP16_DECL__ static __device__ __forceinline__\n"
"typedef struct { unsigned short __x; } __half;\n"
"#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))\n"
"#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))\n"
"__CUDA_FP16_DECL__ __half __float2half_rn(const float f) \n"
"{\n"
" __half val;\n"
" asm(\"{ cvt.rn.f16.f32 %0, %1;}\\n\" : \"=h\"(__HALF_TO_US(val)) : \"f\"(f));\n"
" return val;\n"
"}\n"
"__CUDA_FP16_DECL__ float __half2float(const __half h)\n"
"{\n"
" float val;\n"
" asm(\"{ cvt.f32.f16 %0, %1;}\\n\" : \"=f\"(val) : \"h\"(__HALF_TO_CUS(h)));\n"
" return val;\n"
"}\n"
#endif
Member:
This would have to be detected at runtime by consulting the context.

Contributor Author:
Yes, runtime detection is possible, but again - why?

Contributor Author:
Fixed - though the condition is based on the NVRTC version, not the context.
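
For reference, a minimal sketch of what such a runtime check could look like (CUDA9_FP16_PREAMBLE is a made-up name and this is not the PR's actual diff; nvrtcVersion() is a standard NVRTC entry point):

    int major = 0, minor = 0;
    /* Ask the loaded NVRTC library for its version at runtime instead of
     * baking in CUDA_VERSION_MAJOR at build time. */
    if (nvrtcVersion(&major, &minor) == NVRTC_SUCCESS && major >= 9) {
      /* CUDA 9's NVRTC no longer predeclares the __half helpers this
       * preamble relies on, so emit our own definitions. */
      strb_appends(&sb, CUDA9_FP16_PREAMBLE);
    }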

"#define local_barrier() __syncthreads()\n"
"#define WITHIN_KERNEL extern \"C\" __device__\n"
"#define KERNEL extern \"C\" __global__\n"
@@ -502,11 +521,11 @@ static const char CUDA_PREAMBLE[] =
"#define ga_ulong unsigned long long\n"
"#define ga_float float\n"
"#define ga_double double\n"
"#define ga_half ga_ushort\n"
"#define ga_half __half\n"
"#define ga_size size_t\n"
"#define ga_ssize ptrdiff_t\n"
"#define load_half(p) __half2float(*(p))\n"
"#define store_half(p, v) (*(p) = __float2half_rn(v))\n"
"#define load_half(p) __half2float(*((__half*)p))\n"
"#define store_half(p, v) (*((__half*)p) = __float2half_rn(v))\n"
"#define GA_DECL_SHARED_PARAM(type, name)\n"
"#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n"
"#define GA_WARP_SIZE warpSize\n"
@@ -1098,9 +1117,11 @@ static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) {
static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
nvrtcProgram prog;
size_t buflen;
const char *opts[4] = {
const char *opts[] = {
"-arch", ""
#ifdef DEBUG
, "-G", "-lineinfo"
#endif
};
nvrtcResult err;

@@ -1111,11 +1132,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);

err = nvrtcCompileProgram(prog,
#ifdef DEBUG
4,
#else
2,
#endif
sizeof(opts)/sizeof(char*),
opts);

/* Get the log before handling the error */
32 changes: 16 additions & 16 deletions src/gpuarray_collectives_cuda_nccl.c
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
* \ref
* ncclRedOp_t.
*
* If invalid, return `nccl_NUM_OPS`.
* If invalid, return `ncclNumOps`.
*/
static inline ncclRedOp_t convert_reduce_op(int opcode) {
switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
case GA_MAX: return ncclMax;
case GA_MIN: return ncclMin;
}
return nccl_NUM_OPS;
return ncclNumOps;
}

/**
* \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
* ncclDataType_t.
*
* If invalid, return `nccl_NUM_TYPES`.
* If invalid, return `ncclNumTypes`.
*/
static inline ncclDataType_t convert_data_type(int typecode) {
switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
case GA_ULONG: return ncclUint64;
case GA_HALF: return ncclHalf;
}
return nccl_NUM_TYPES;
return ncclNumTypes;
}

/**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
// typecode must correspond to a valid ncclDataType_t
if (datatype != NULL) {
*datatype = convert_data_type(typecode);
if (*datatype == nccl_NUM_TYPES)
if (*datatype == ncclNumTypes)
return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
}
// opcode must correspond to a valid ncclRedOp_t
if (op != NULL) {
*op = convert_reduce_op(opcode);
if (*op == nccl_NUM_OPS)
if (*op == ncclNumOps)
return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
}
// offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
size_t count, int typecode, int opcode, int root,
gpucomm *comm) {
// need dummy init so that compiler shuts up
ncclRedOp_t op = nccl_NUM_OPS;
ncclDataType_t datatype = nccl_NUM_TYPES;
ncclRedOp_t op = ncclNumOps;
ncclDataType_t datatype = ncclNumTypes;
gpudata *dst = NULL;
int rank = 0;
cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
size_t offdest, size_t count, int typecode, int opcode,
gpucomm *comm) {
// need dummy init so that compiler shuts up
ncclRedOp_t op = nccl_NUM_OPS;
ncclDataType_t datatype = nccl_NUM_TYPES;
ncclRedOp_t op = ncclNumOps;
ncclDataType_t datatype = ncclNumTypes;
cuda_context *ctx;

ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
size_t offdest, size_t count, int typecode,
int opcode, gpucomm *comm) {
// need dummy init so that compiler shuts up
ncclRedOp_t op = nccl_NUM_OPS;
ncclDataType_t datatype = nccl_NUM_TYPES;
ncclRedOp_t op = ncclNumOps;
ncclDataType_t datatype = ncclNumTypes;
int ndev = 0;
size_t resc_size;
cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
int root, gpucomm *comm) {
// need dummy init so that compiler shuts up
ncclDataType_t datatype = nccl_NUM_TYPES;
ncclDataType_t datatype = ncclNumTypes;
int rank = 0;
cuda_context *ctx;

@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
size_t offdest, size_t count, int typecode,
gpucomm *comm) {
// need dummy init so that compiler shuts up
ncclDataType_t datatype = nccl_NUM_TYPES;
ncclDataType_t datatype = ncclNumTypes;
int ndev = 0;
size_t resc_size;
cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,

// change stream of nccl ops to enable concurrency
NCCL_EXIT_ON_ERROR(
ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
(void *)(dest->ptr + offdest), comm->c, ctx->s));
ctx, ncclAllGather((void *)(src->ptr + offsrc),
(void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));

GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
12 changes: 6 additions & 6 deletions src/gpuarray_elemwise.c
@@ -208,10 +208,10 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
}
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
strb_appendf(&sb, "%s %s;", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
strb_appendf(&sb, "%s %s;", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?
Member:
Why are you disabling the flag here?

Contributor Author:
I was getting some test errors - since half is now defined as a struct, there are no cases where explicit conversion is not needed, if I understood this flag correctly.

Contributor Author:
And yes, the test errors went away after that. I even ran the nose tests with floatX=float16, with very few errors.

Member:
The intent of the flag is to choose between

  • store float16, compute float32
  • store float16, compute float16.

In the latter case I'll admit that I just let the compiler figure out how to do that, and it never worked properly.

Contributor Author:
These tests would fail without removing the ISSET check:
ERROR: test_long (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test_inc_and_set_subtensor (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test_ellipsis (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test_advanced1_inc_and_set (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test2_ok_strided (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test2_ok_rows_finite (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test2_ok_range_finite (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test2_ok_col (theano.gpuarray.tests.test_subtensor.G_subtensorF16)
ERROR: test1_ok_strided (theano.gpuarray.tests.test_subtensor.G_subtensorF16)

Member:
Ok, I'll have to take a look at this to figure out the proper solution. I don't have much time this week, so this may wait a while.

I'm leaning towards adding a new type for native f16 compute and keeping the existing type for f32 compute.

Contributor Author:
I have to warn you: using native fp16 is almost never a good idea - we do not do it in most frameworks due to precision issues, and in many cases it is also slower. Implementing the switch via separate types was also tried and proved to be more trouble than it was worth - better to use a single storage type and have a switch. If you return to this: Volta also has FP16 HMMA (compute in f16 with an f32 accumulator) - all three maths (pseudo f16, native f16, f16 HMMA) are slightly different, so plan in advance :)

Member:
Maybe we will stick with float16 meaning f32 compute for now, then, and nothing for native float16. In any case, that would allow removing the flag.
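
To make the two modes concrete, here is a hedged sketch (the kernel and variable names are made up, not this PR's generated output) of what "store float16, compute float32" looks like in a generated contiguous kernel, using the load_half/store_half helpers and indexing macros from the CUDA preamble:

    KERNEL void mul2_contig(GLOBAL_MEM ga_half *x_p,
                            GLOBAL_MEM ga_half *y_p, ga_size n) {
      ga_size i;
      for (i = LID_0 + GID_0 * LDIM_0; i < n; i += LDIM_0 * GDIM_0) {
        ga_float x = load_half(&x_p[i]);  /* __half2float on load */
        store_half(&y_p[i], x * 2.0f);    /* __float2half_rn on store */
      }
    }

"Store float16, compute float16" would instead declare x as ga_half and operate on it directly, which is where the precision and performance caveats above come in.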

GA_FLOAT : args[j].typecode), args[j].name);
if (ISSET(args[j].flags, GE_READ)) {
if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n",
args[j].name, args[j].name, args[j].name);
} else {
@@ -225,7 +225,7 @@
strb_appends(&sb, ";\n");
for (j = 0; j < n; j++) {
if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) {
if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n",
args[j].name, args[j].name, args[j].name);
} else {
@@ -484,10 +484,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
strb_appends(&sb, "for (i = idx; i < n; i += numThreads) {\n");
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
strb_appendf(&sb, "%s %s;\n", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
strb_appendf(&sb, "%s %s;\n", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?
GA_FLOAT : args[j].typecode), args[j].name);
if (ISSET(args[j].flags, GE_READ)) {
if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name);
} else {
strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name);
@@ -501,7 +501,7 @@
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
if (ISSET(args[j].flags, GE_WRITE)) {
if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
if (args[j].typecode == GA_HALF/* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name);
} else {
strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name);
11 changes: 5 additions & 6 deletions src/loaders/libnccl.fn
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
Member:
nccl Got updated to use size_t now? Is there a way to detect that when we load the library. I would like to prevent people from loading the older one.

Contributor Author:
Yes, those are NCCL 2.0 definitions. I think if you #include nccl.h as well, you will get compiler errors if the definitions are not the same.

Member:
These definitions are used to dlopen the library and grab some function pointers. There won't be any compiler double-checking our work, so we need to be careful.

That being said, I have no issues with dropping support for NCCL 1.0 and blocking the load in that case.

Contributor Author:
Why? If you #include nccl.h in a .cpp file and then include your .fn file with the proper expansion, you end up with two sets of extern function declarations. If they don't match, the compile breaks.

Member:
We never include the real nccl.h anywhere.

Contributor Author:
@abergeron: my sentiment exactly - if you included it as I suggested above, you would be able to detect the API change.

Member:
I don't want to include it, because I want to be able to build on machines where it is not present and then load it if it is installed later.

Contributor Author:
Right, this is important. Could be an optional target only.

Member:
One way to do this might be to add one of the new group API functions to the set of required functions. That will make the load fail for version 1.0, which should prevent problems of this sort.
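
A sketch of that load-time gate, assuming a plain dlopen()-based loader (the project's actual loader machinery differs): ncclGroupStart() first appeared in NCCL 2.0, so requiring it rejects 1.x libraries.

    #include <dlfcn.h>

    /* Hypothetical check: NCCL 1.x lacks ncclGroupStart, and also uses int
     * counts and the old ncclAllGather argument order, so refuse to load it. */
    static void *load_nccl2_or_fail(void) {
      void *h = dlopen("libnccl.so.2", RTLD_NOW | RTLD_LOCAL);
      if (h != NULL && dlsym(h, "ncclGroupStart") == NULL) {
        dlclose(h);  /* NCCL 1.x detected: reject */
        h = NULL;
      }
      return h;
    }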

22 changes: 12 additions & 10 deletions src/loaders/libnccl.h
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

typedef enum { ncclSuccess = 0 } ncclResult_t;

/* Reduction operation selector */
typedef enum { ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
nccl_NUM_OPS = 4 } ncclRedOp_t;

ncclNumOps = 4 } ncclRedOp_t;
/* Data types */
typedef enum { ncclChar = 0,
ncclInt = 1,
ncclHalf = 2,
ncclFloat = 3,
ncclDouble = 4,
ncclInt64 = 5,
ncclUint64 = 6,
nccl_NUM_TYPES = 7 } ncclDataType_t;
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclNumTypes = 9 } ncclDataType_t;

/** @endcond */
