Index: lib/Headers/CMakeLists.txt
===================================================================
--- lib/Headers/CMakeLists.txt
+++ lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   bmiintrin.h
   cpuid.h
   cuda_builtin_vars.h
+  clang_cuda_support.h
   emmintrin.h
   f16cintrin.h
   float.h
Index: lib/Headers/clang_cuda_support.h
===================================================================
--- /dev/null
+++ lib/Headers/clang_cuda_support.h
@@ -0,0 +1,121 @@
+#ifndef __CLANG_CUDA_SUPPORT_H__
+#define __CLANG_CUDA_SUPPORT_H__
+
+#if defined(__PTX__)
+
+#define __NVCC__ 1
+#if defined(__CUDA_ARCH__)
+#define __CUDABE__ 1
+#else
+#define __CUDACC__ 1
+#endif
+
+// Fake include guards to prevent inclusion of some CUDA headers.
+#define __HOST_DEFINES_H__
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_HPP__
+#define __SURFACE_INDIRECT_FUNCTIONS_HPP__
+
+// Standard CUDA attributes.
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __shared__ __attribute__((shared))
+
+// Additional macros used throughout CUDA headers.
+#define __align__(x) __attribute__((aligned(x)))
+#define __builtin_align__(x) __align__(x)
+#define __cudart_builtin__
+#define __device_builtin__
+#define __forceinline__ __inline__ __attribute__((always_inline))
+
+#define CUDARTAPI
+#define _CRTIMP
+
+// Texture and surface types are not supported yet.
+#define __device_builtin_surface_type__
+#define __device_builtin_texture_type__
+
+// Include support for built-in variables.
+#include "cuda_builtin_vars.h"
+
+// CUDA headers were implemented with the assumption of split-mode
+// compilation and present CUDA functions differently for host and
+// device mode. Typically, in host mode they provide declarations with
+// the __device__ attribute attached; in device mode we get definitions,
+// but *without* the __device__ attribute. This does not work well in
+// the combined compilation mode used by clang, so we have to trick the
+// CUDA headers into something we can use.
+
+// WARNING: The preprocessor hacks below are based on specifics of the
+// CUDA-7.0 header implementation and are expected to break with any
+// other version of the CUDA headers.
+
+// libdevice functions in device_functions_decls.h either come with
+// __host__ __device__ attributes or with none at all. Temporarily
+// redefine __host__ to nothing so only __device__ is applied.
+#pragma push_macro("__CUDACC_RTC__")
+#pragma push_macro("__host__")
+#define __CUDACC_RTC__
+#define __host__
+#include "device_functions_decls.h"
+#pragma pop_macro("__host__")
+#pragma pop_macro("__CUDACC_RTC__")
+
+#include "cuda_runtime.h"
+#include "crt/device_runtime.h"
+
+#if defined(__CUDA_ARCH__)
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.h"
+#include "math_functions.h"
+#pragma pop_macro("__forceinline__")
+#else
+#include "device_functions.h"
+#include "math_functions.h"
+#endif
+
+#if defined(__CUDA_ARCH__)
+// Definitions of device-specific functions are provided only if
+// __CUDACC__ is defined. Alas, they have already been transitively
+// included by device_functions.h and are now behind include guards.
+// We need to temporarily define __CUDACC__, undo the include guards,
+// and re-include the files that implement these functions.
+
+#pragma push_macro("__CUDACC__")
+#define __CUDACC__ 1
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+
+#undef __SM_20_ATOMIC_FUNCTIONS_HPP__
+#include "sm_20_atomic_functions.hpp"
+#undef __SM_32_ATOMIC_FUNCTIONS_HPP__
+#include "sm_32_atomic_functions.hpp"
+
+#undef __SM_20_INTRINSICS_HPP__
+#include "sm_20_intrinsics.hpp"
+#undef __SM_30_INTRINSICS_HPP__
+#include "sm_30_intrinsics.hpp"
+#undef __SM_32_INTRINSICS_HPP__
+#include "sm_32_intrinsics.hpp"
+
+#pragma pop_macro("__CUDACC__")
+
+// CUDA headers pull in stdlib.h on the host side of compilation, and
+// a lot of existing CUDA code assumes it. Because clang sees both the
+// host and device sides of CUDA code simultaneously, we have to
+// include stdlib.h on the device side as well.
+
+#include <stdlib.h>
+
+#endif // __CUDA_ARCH__
+#endif // __PTX__
+#endif // __CLANG_CUDA_SUPPORT_H__
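For reference, a minimal usage sketch (not part of the patch): a trivial kernel that relies on the attribute macros and cuda_builtin_vars.h set up above. The file name, the kernel itself, and the idea of pulling the header in ahead of the CUDA headers via clang's generic -include flag are illustrative assumptions, not something the patch prescribes.

// axpy.cu -- hypothetical smoke test, not part of this patch.
// Assumes clang_cuda_support.h is seen before any CUDA runtime header,
// e.g. by passing "-include clang_cuda_support.h" on the clang command line.

// __global__ expands to __attribute__((global)) as defined above;
// threadIdx/blockIdx/blockDim come from cuda_builtin_vars.h.
__global__ void axpy(float a, const float *x, float *y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

// A __host__ __device__ helper: visible on both sides of the combined
// host/device compilation that clang performs.
__host__ __device__ inline float clampf(float v, float lo, float hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}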