Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -1774,8 +1774,7 @@ Args.getLastArgValue(options::OPT_cuda_path_EQ)); else { CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda"); - // FIXME: Uncomment this once we can compile the cuda 8 headers. - // CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); + CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0"); } @@ -1795,6 +1794,16 @@ FS.exists(LibDevicePath))) continue; + llvm::ErrorOr> VersionFile = + FS.getBufferForFile(InstallPath + "/version.txt"); + if (!VersionFile) { + // CUDA 7.0 doesn't have a version.txt, so guess that's our version if + // version.txt isn't present. + Version = CudaVersion::CUDA_70; + } else { + Version = ParseCudaVersionFile((*VersionFile)->getBuffer()); + } + std::error_code EC; for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE; !EC && LI != LE; LI = LI.increment(EC)) { @@ -1807,24 +1816,20 @@ StringRef GpuArch = FileName.slice( LibDeviceName.size(), FileName.find('.', LibDeviceName.size())); LibDeviceMap[GpuArch] = FilePath.str(); - // Insert map entries for specifc devices with this compute capability. - // NVCC's choice of libdevice library version is rather peculiar: - // http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection - // TODO: this will need to be updated once CUDA-8 is released. + // Insert map entries for specifc devices with this compute + // capability. NVCC's choice of the libdevice library version is + // rather peculiar and depends on the CUDA version. if (GpuArch == "compute_20") { LibDeviceMap["sm_20"] = FilePath; LibDeviceMap["sm_21"] = FilePath; LibDeviceMap["sm_32"] = FilePath; } else if (GpuArch == "compute_30") { LibDeviceMap["sm_30"] = FilePath; - // compute_30 is the fallback libdevice variant for sm_30+, - // unless CUDA specifies different version for specific GPU - // arch. - LibDeviceMap["sm_50"] = FilePath; - LibDeviceMap["sm_52"] = FilePath; - LibDeviceMap["sm_53"] = FilePath; - // sm_6? are currently all aliases for sm_53 in LLVM and - // should use compute_30. + if (Version < CudaVersion::CUDA_80) { + LibDeviceMap["sm_50"] = FilePath; + LibDeviceMap["sm_52"] = FilePath; + LibDeviceMap["sm_53"] = FilePath; + } LibDeviceMap["sm_60"] = FilePath; LibDeviceMap["sm_61"] = FilePath; LibDeviceMap["sm_62"] = FilePath; @@ -1832,21 +1837,14 @@ LibDeviceMap["sm_35"] = FilePath; LibDeviceMap["sm_37"] = FilePath; } else if (GpuArch == "compute_50") { - // NVCC does not use compute_50 libdevice at all at the moment. - // The version that's shipped with CUDA-7.5 is a copy of compute_30. + if (Version >= CudaVersion::CUDA_80) { + LibDeviceMap["sm_50"] = FilePath; + LibDeviceMap["sm_52"] = FilePath; + LibDeviceMap["sm_53"] = FilePath; + } } } - llvm::ErrorOr> VersionFile = - FS.getBufferForFile(InstallPath + "/version.txt"); - if (!VersionFile) { - // CUDA 7.0 doesn't have a version.txt, so guess that's our version if - // version.txt isn't present. - Version = CudaVersion::CUDA_70; - } else { - Version = ParseCudaVersionFile((*VersionFile)->getBuffer()); - } - IsValid = true; break; } Index: lib/Headers/__clang_cuda_runtime_wrapper.h =================================================================== --- lib/Headers/__clang_cuda_runtime_wrapper.h +++ lib/Headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 #error "Unsupported CUDA version!" #endif @@ -113,6 +113,7 @@ #undef __cxa_vec_ctor #undef __cxa_vec_cctor #undef __cxa_vec_dtor +#undef __cxa_vec_new #undef __cxa_vec_new2 #undef __cxa_vec_new3 #undef __cxa_vec_delete2 @@ -135,6 +136,25 @@ // the headers we're about to include. #define __host__ UNEXPECTED_HOST_ATTRIBUTE +// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values +// Previous versions used to check thether they are defined or not. +// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it +// here to detect the switch. + +#if defined(CU_DEVICE_INVALID) +#if defined(__USE_FAST_MATH__) && __USE_FAST_MATH__ +#define __USE_FAST_MATH__ 1 +#else +#define __USE_FAST_MATH__ 0 +#endif + +#if defined(__CUDA_PREC_DIV) && __CUDA_PREC_DIV +#define __CUDA_PREC_DIV 1 +#else +#define __CUDA_PREC_DIV 0 +#endif +#endif + // device_functions.hpp and math_functions*.hpp use 'static // __forceinline__' (with no __device__) for definitions of device // functions. Temporarily redefine __forceinline__ to include @@ -151,7 +171,7 @@ // slow divides), so we need to scope our define carefully here. #pragma push_macro("__USE_FAST_MATH__") #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) -#define __USE_FAST_MATH__ +#define __USE_FAST_MATH__ 1 #endif #include "math_functions.hpp" #pragma pop_macro("__USE_FAST_MATH__") Index: test/Driver/cuda-detect.cu =================================================================== --- test/Driver/cuda-detect.cu +++ test/Driver/cuda-detect.cu @@ -22,13 +22,14 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 -// sm_30, sm_5x and sm_6x map to compute_30 +// sm_30, sm_6x map to compute_30. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// sm_5x is a special case. Maps to compute_30 for cuda-7.x only. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ -// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ @@ -44,6 +45,12 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// sm_5x -> compute_50 for CUDA-8.0 and newer. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ +// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON \ +// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 + // Verify that -nocudainc prevents adding include path to CUDA headers. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ @@ -56,8 +63,8 @@ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC // Verify that we get an error if there's no libdevice library to link with. -// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30 for this purpose. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ +// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE @@ -81,7 +88,7 @@ // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NOCUDA-NOT: Found CUDA installation: -// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30. +// MISSINGLIBDEVICE: error: cannot find libdevice for sm_20. // COMMON: "-triple" "nvptx-nvidia-cuda" // COMMON-SAME: "-fcuda-is-device" @@ -90,6 +97,7 @@ // LIBDEVICE20-SAME: libdevice.compute_20.10.bc // LIBDEVICE30-SAME: libdevice.compute_30.10.bc // LIBDEVICE35-SAME: libdevice.compute_35.10.bc +// LIBDEVICE50-SAME: libdevice.compute_50.10.bc // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc // LIBDEVICE-SAME: "-target-feature" "+ptx42" // NOLIBDEVICE-NOT: "-target-feature" "+ptx42"