Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -3815,8 +3815,11 @@
   if (DriverArgs.hasArg(options::OPT_nocudainc))
     return;
 
-  if (CudaInstallation.isValid())
+  if (CudaInstallation.isValid()) {
     addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
+    CC1Args.push_back("-include");
+    CC1Args.push_back("cuda_runtime.h");
+  }
 }
 
 bool Linux::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); }
Index: lib/Headers/CMakeLists.txt
===================================================================
--- lib/Headers/CMakeLists.txt
+++ lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   bmiintrin.h
   cpuid.h
   cuda_builtin_vars.h
+  cuda_runtime.h
   emmintrin.h
   f16cintrin.h
   float.h
Index: lib/Headers/cuda_runtime.h
===================================================================
--- /dev/null
+++ lib/Headers/cuda_runtime.h
@@ -0,0 +1,119 @@
+#ifndef __CLANG_CUDA_SUPPORT_H__
+#define __CLANG_CUDA_SUPPORT_H__
+
+#if defined(__PTX__)
+
+// WARNING: The preprocessor hacks below are based on the specifics of
+// the CUDA 7.0 header implementation and are expected to break with
+// any other version of the CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION != 7000
+#error "Unsupported CUDA version!"
+#endif
+
+#define __NVCC__ 1
+#if defined(__CUDA_ARCH__)
+#define __CUDABE__ 1
+#else
+#define __CUDACC__ 1
+#endif
+
+// Fake include guards to prevent inclusion of some CUDA headers.
+#define __HOST_DEFINES_H__
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_HPP__
+#define __SURFACE_INDIRECT_FUNCTIONS_HPP__
+
+// Standard CUDA attributes.
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __shared__ __attribute__((shared))
+
+// Additional macros used throughout the CUDA headers.
+#define __align__(x) __attribute__((aligned(x)))
+#define __builtin_align__(x) __align__(x)
+#define __cudart_builtin__
+#define __device_builtin__
+#define __forceinline__ __inline__ __attribute__((always_inline))
+
+#define CUDARTAPI
+#define _CRTIMP
+
+// Texture and surface types are not supported yet.
+#define __device_builtin_surface_type__
+#define __device_builtin_texture_type__
+
+// Include support for built-in variables.
+#include "cuda_builtin_vars.h"
+
+// The CUDA headers were implemented with the assumption of split-mode
+// compilation and present CUDA functions differently for host and
+// device mode. Typically, in host mode they provide declarations with
+// the __device__ attribute attached; in device mode we get definitions,
+// but *without* the __device__ attribute. This does not work well with
+// the combined compilation mode used by clang, so we have to trick the
+// CUDA headers into something we can use.
+
+// libdevice functions in device_functions_decls.h come either with
+// __host__ __device__ attributes or with none at all. Temporarily
+// redefine __host__ to nothing so that only __device__ is applied.
+#pragma push_macro("__CUDACC_RTC__") +#pragma push_macro("__host__") +#define __CUDACC_RTC__ +#define __host__ +#include "device_functions_decls.h" +#pragma pop_macro("__host__") +#pragma pop_macro("__CUDACC_RTC__") + +#include_next "cuda_runtime.h" +#include "crt/device_runtime.h" + +#if defined(__CUDA_ARCH__) +// device_functions.hpp and math_functions*.hpp use 'static +// __forceinline__' (with no __device__) for definitions of device +// functions. Temporarily redefine __forceinline__ to include +// __device__. +#pragma push_macro("__forceinline__") +#define __forceinline__ __device__ __inline__ __attribute__((always_inline)) +#include "device_functions.h" +#include "math_functions.h" +#pragma pop_macro("__forceinline__") +#else +#include "device_functions.h" +#include "math_functions.h" +#endif + +#if defined(__CUDA_ARCH__) +// Definitions for device specific functions are provided only if +// __CUDACC__ is defined. Alas, they've already been transitively +// included by device_functions.h and are now behind include guards. +// We need to temporarily define __CUDACC__, undo include guards and +// include the files with implementation of these functions. + +#pragma push_macro("__CUDACC__") +#define __CUDACC__ 1 + +#undef __DEVICE_ATOMIC_FUNCTIONS_HPP__ +#include "device_atomic_functions.hpp" + +#undef __SM_20_ATOMIC_FUNCTIONS_HPP__ +#include "sm_20_atomic_functions.hpp" +#undef __SM_32_ATOMIC_FUNCTIONS_HPP__ +#include "sm_32_atomic_functions.hpp" + +#undef __SM_20_INTRINSICS_HPP__ +#include "sm_20_intrinsics.hpp" +#undef __SM_30_INTRINSICS_HPP__ +#include "sm_30_intrinsics.hpp" +#undef __SM_32_INTRINSICS_HPP__ +#include "sm_32_intrinsics.hpp" + +#pragma pop_macro("__CUDACC__") +#endif // __CUDA_ARCH__ +#endif // __PTX__ +#endif // __CLANG_CUDA_SUPPORT_H__
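
For reference (not part of the patch): a hypothetical smoke-test kernel that exercises the pieces the wrapper header wires up. With a valid CUDA 7.0 installation detected by the driver and -nocudainc not given, cuda_runtime.h is pre-included via the new "-include" argument, so the file needs no explicit CUDA includes. The file name and kernel are illustrative only, and whether it compiles end-to-end also depends on the rest of clang's CUDA support.

// smoke.cu (hypothetical example, for illustration only)
__global__ void bucketize(const float *in, int n, unsigned int *bins) {
  // Built-in variables come from cuda_builtin_vars.h, included by the
  // wrapper header above.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // floorf() comes from math_functions.h; atomicAdd() comes from the
    // device_atomic_functions/sm_*_atomic_functions headers that the
    // wrapper includes in device mode.
    int b = (int)floorf(in[i] * 10.0f);
    atomicAdd(&bins[b], 1u);
  }
}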