Fixed fp16 breakage caused by CUDA9 changes #485
Changes from 4 commits
@@ -2,6 +2,7 @@
#define LIBGPU_EXT_CUDA

#include <cuda.h>
+#include <cuda_fp16.h>

Reviewer: Why do you need this include here?
Author: Well, if we have fp16-related code anywhere, cuda_fp16.h should be thought of as an extension of cuda.h.
Reviewer: Ok.


#include <gpuarray/config.h>
#include <gpuarray/buffer.h>
@@ -2,6 +2,7 @@

#include "private.h"
#include "private_cuda.h"

#include "loaders/libnvrtc.h"
#include "loaders/libcublas.h"
@@ -465,6 +466,24 @@ size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
}

static const char CUDA_PREAMBLE[] =
+#if CUDA_VERSION_MAJOR >= 9
+"#define __CUDA_FP16_DECL__ static __device__ __forceinline__\n"
+"typedef struct { unsigned short __x; } __half;\n"
+"#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))\n"
+"#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))\n"
+"__CUDA_FP16_DECL__ __half __float2half_rn(const float f)\n"
+"{\n"
+"  __half val;\n"
+"  asm(\"{ cvt.rn.f16.f32 %0, %1;}\\n\" : \"=h\"(__HALF_TO_US(val)) : \"f\"(f));\n"
+"  return val;\n"
+"}\n"
+"__CUDA_FP16_DECL__ float __half2float(const __half h)\n"
+"{\n"
+"  float val;\n"
+"  asm(\"{ cvt.f32.f16 %0, %1;}\\n\" : \"=f\"(val) : \"h\"(__HALF_TO_CUS(h)));\n"
+"  return val;\n"
+"}\n"
+#endif

Reviewer: This would have to be detected at runtime by consulting the context.
Author: Yes, runtime is possible, but, again - why?
Author: Fixed - the condition is not the context though, but the RTC version.

"#define local_barrier() __syncthreads()\n"
"#define WITHIN_KERNEL extern \"C\" __device__\n"
"#define KERNEL extern \"C\" __global__\n"
@@ -502,11 +521,11 @@ static const char CUDA_PREAMBLE[] =
"#define ga_ulong unsigned long long\n"
"#define ga_float float\n"
"#define ga_double double\n"
-"#define ga_half ga_ushort\n"
+"#define ga_half __half\n"
"#define ga_size size_t\n"
"#define ga_ssize ptrdiff_t\n"
-"#define load_half(p) __half2float(*(p))\n"
-"#define store_half(p, v) (*(p) = __float2half_rn(v))\n"
+"#define load_half(p) __half2float(*((__half*)p))\n"
+"#define store_half(p, v) (*((__half*)p) = __float2half_rn(v))\n"
"#define GA_DECL_SHARED_PARAM(type, name)\n"
"#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n"
"#define GA_WARP_SIZE warpSize\n"
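
Since ga_half is now the __half struct from the preamble rather than ga_ushort, there is no implicit conversion left between half storage and float arithmetic; everything has to go through load_half/store_half (this is also what the GEN_CONVERT_F16 discussion further down is about). A minimal, hypothetical illustration of that constraint, assuming the usual preamble definitions (GLOBAL_MEM expanding to nothing for CUDA; the kernel name is invented):

KERNEL void fill_half(GLOBAL_MEM ga_half *out, float v) {
  /* out[0] = v;   <- no longer compiles: there is no implicit
                      float -> __half conversion now that ga_half is a struct */
  store_half(&out[0], v);            /* explicit float -> half, via __float2half_rn */
  float back = load_half(&out[0]);   /* explicit half -> float, via __half2float */
  (void)back;                        /* silence unused-variable warnings */
}
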
@@ -1098,9 +1117,11 @@ static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) {
static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
nvrtcProgram prog;
size_t buflen;
-const char *opts[4] = {
+const char *opts[] = {
"-arch", ""
#ifdef DEBUG
, "-G", "-lineinfo"
#endif
};
nvrtcResult err;
@@ -1111,11 +1132,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);

err = nvrtcCompileProgram(prog,
-#ifdef DEBUG
-4,
-#else
-2,
-#endif
+sizeof(opts)/sizeof(char*),
opts);

/* Get the log before handling the error */
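
The point of sizeof(opts)/sizeof(char*) is that the option count now follows the initializer automatically, so the DEBUG-only entries cannot drift out of sync with a hand-maintained 4/2. A small standalone sketch of the idiom (names are illustrative, not from the PR):

#include <stdio.h>

int main(void) {
  const char *opts[] = {
    "-arch", ""
#ifdef DEBUG
    , "-G", "-lineinfo"
#endif
  };
  /* 2 in a normal build, 4 when DEBUG is defined - no manual count to update. */
  printf("%zu options\n", sizeof(opts) / sizeof(opts[0]));
  return 0;
}
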
@@ -208,10 +208,10 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
}
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
-strb_appendf(&sb, "%s %s;", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
+strb_appendf(&sb, "%s %s;", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?

Reviewer: Why are you disabling the flag here?
Author: I was getting some test errors - since half is now defined as a struct, there are no cases when explicit conversion is not needed - if I understood this flag correctly.
Author: And yes, the test errors went away since then. I even ran the nose tests with floatX=float16 with very few errors.
Reviewer: The intent of the flag is to choose between converting to float32 for the computation and doing the computation natively in float16. In the latter case I'll admit that I just let the compiler figure out how to do that and it never worked properly.
Author: Those tests would fail if not removing ISSET:
Reviewer: Ok, I'll have to take a look at this to figure out the proper solution. I don't have much time to do it this week, so this may wait a while. I'm leaning towards adding a new type for native f16 compute and keeping the existing type for f32 compute.
Author: I have to warn you that using native fp16 is almost never a good idea - we do not do it in most frameworks due to the precision issues, and in many cases it is also slower. Implementing the switch via separate types was also tried and proved more trouble than it was worth - better to use a single storage type and have a switch. If you return to this - Volta also has FP16 HMMA (compute in f16 with an f32 accumulator) - all three maths (pseudo f16, native f16, f16 HMMA) are slightly different, so plan in advance :)
Reviewer: Maybe we will stick with float16 meaning f32 compute for now then, and nothing for native float16. In any case that would allow removing the flag.

GA_FLOAT : args[j].typecode), args[j].name);
if (ISSET(args[j].flags, GE_READ)) {
-if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n",
args[j].name, args[j].name, args[j].name);
} else {
@@ -225,7 +225,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
strb_appends(&sb, ";\n");
for (j = 0; j < n; j++) {
if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) {
-if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16)*/) {
strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n",
args[j].name, args[j].name, args[j].name);
} else {
@@ -484,10 +484,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
strb_appends(&sb, "for (i = idx; i < n; i += numThreads) {\n");
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
-strb_appendf(&sb, "%s %s;\n", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ?
+strb_appendf(&sb, "%s %s;\n", ctype(/* ISSET(gen_flags, GEN_CONVERT_F16) && */ args[j].typecode == GA_HALF ?
GA_FLOAT : args[j].typecode), args[j].name);
if (ISSET(args[j].flags, GE_READ)) {
-if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name);
} else {
strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name);
@@ -501,7 +501,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
for (j = 0; j < n; j++) {
if (is_array(args[j])) {
if (ISSET(args[j].flags, GE_WRITE)) {
-if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
+if (args[j].typecode == GA_HALF /* && ISSET(gen_flags, GEN_CONVERT_F16) */) {
strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name);
} else {
strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name);
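
To make the generator changes concrete, here is a rough, hypothetical reconstruction of the contiguous elementwise source these strb_appendf calls emit for a single GA_HALF argument named a with GE_READ|GE_WRITE: the local is declared as ga_float, reads go through load_half and writes through store_half. The kernel signature and the thread-index lines are simplified with raw CUDA builtins; the real generator emits its own index setup and the user-supplied expression.

KERNEL void elem_contig(const ga_size n, GLOBAL_MEM ga_half *a_p) {
  ga_size idx = blockDim.x * blockIdx.x + threadIdx.x;
  ga_size numThreads = blockDim.x * gridDim.x;
  for (ga_size i = idx; i < n; i += numThreads) {
    ga_float a;               /* GA_HALF argument is declared as a float */
    a = load_half(&a_p[i]);   /* read branch: __half storage -> float    */
    a = a + 1.0f;             /* the user-supplied expression goes here  */
    store_half(&a_p[i], a);   /* write branch: float -> __half storage   */
  }
}
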
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
-DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));

Reviewer: So nccl got updated to use size_t now? Is there a way to detect that when we load the library? I would like to prevent people from loading the older one.
Author: Yes, those are nccl 2.0 definitions. I think if you #include nccl.h as well, you will get compiler errors if the definitions are not the same.
Reviewer: These definitions are used to dlopen the library and grab some function pointers. There won't be any compiler double-checking our work, so we need to be careful. That being said, I have no issue with dropping support for nccl 1.0 and blocking the load in that case.
Author: Why? If you #include nccl.h in a .cpp and then include your .fn with the proper expansion, you end up with two sets of extern function declarations. If they don't match, the compile breaks.
Reviewer: We never include the real nccl.h anywhere.
Author: @abergeron: my sentiment exactly - should you include it as I suggested above, you would be able to detect the API change.
Reviewer: I don't want to include it because I want to be able to build on machines where it is not present and then load it if it is installed later.
Author: Right, this is important. It could be an optional target only.
Reviewer: One way to do this might be to add one of the new group API functions to the set of required functions. That will make the load fail for version 1.0, which should prevent problems of this sort.

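Following the suggestion at the end of this thread, one way to block NCCL 1.x at load time is to make a symbol that only exists in the 2.0 API (for example ncclGroupStart) part of the required set. A hedged sketch of the idea using plain dlopen/dlsym - the project's actual loader goes through its DEF_PROC machinery instead:

#include <dlfcn.h>
#include <stdio.h>

/* Returns a handle only if the library looks like NCCL >= 2.0. */
static void *load_nccl2(void) {
  void *h = dlopen("libnccl.so.2", RTLD_NOW | RTLD_LOCAL);
  if (h == NULL)
    h = dlopen("libnccl.so", RTLD_NOW | RTLD_LOCAL);
  if (h == NULL)
    return NULL;
  /* ncclGroupStart is part of the group API introduced with NCCL 2;
   * its absence means the size_t-based signatures above would be wrong. */
  if (dlsym(h, "ncclGroupStart") == NULL) {
    fprintf(stderr, "NCCL >= 2.0 is required\n");
    dlclose(h);
    return NULL;
  }
  return h;
}
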
Reviewer: I don't want to depend on an installed version of CUDA to compile.
Author: Why? Would you ever build Theano on a CUDA 8 system and run on CUDA 9?
Author: Actually, you're right - no need to introduce a build-time dependency here.
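
Since the eventual fix keys the preamble off the RTC version rather than a compile-time CUDA_VERSION_MAJOR check, the condition can be evaluated at runtime through NVRTC itself. A hedged sketch, assuming nvrtc.h is available (the project actually reaches NVRTC through its loaders/libnvrtc.h wrapper, so the real check would go through that):

#include <nvrtc.h>

/* Nonzero when the NVRTC loaded at runtime is CUDA 9 or newer, mirroring
 * the #if CUDA_VERSION_MAJOR >= 9 condition of this diff without needing
 * a CUDA 9 toolkit at build time. */
static int rtc_is_cuda9_or_newer(void) {
  int major = 0, minor = 0;
  if (nvrtcVersion(&major, &minor) != NVRTC_SUCCESS)
    return 0;
  return major >= 9;
}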