Index: lib/Headers/CMakeLists.txt
===================================================================
--- lib/Headers/CMakeLists.txt
+++ lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   bmiintrin.h
   cpuid.h
   cuda_builtin_vars.h
+  clang_cuda_support.h
   emmintrin.h
   f16cintrin.h
   float.h
Index: lib/Headers/clang_cuda_support.h
===================================================================
--- /dev/null
+++ lib/Headers/clang_cuda_support.h
@@ -0,0 +1,119 @@
+#ifndef __CLANG_CUDA_SUPPORT_H__
+#define __CLANG_CUDA_SUPPORT_H__
+
+#if defined(__PTX__)
+
+// WARNING: Preprocessor hacks below are based on specific of
+// implementation of CUDA-7.0 headers and are expected to break with
+// any other version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION != 7000
+#error "Unsupported CUDA version!"
+#endif
+
+#define __NVCC__ 1
+#if defined(__CUDA_ARCH__)
+#define __CUDABE__ 1
+#else
+#define __CUDACC__ 1
+#endif
+
+// Fake include guards to prevent inclusion of some CUDA headers.
+#define __HOST_DEFINES_H__
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_HPP__
+#define __SURFACE_INDIRECT_FUNCTIONS_HPP__
+
+// Standard CUDA attributes
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __shared__ __attribute__((shared))
+
+// Additional macros used throughout CUDA headers.
+#define __align__(x) __attribute__((aligned(x)))
+#define __builtin_align__(x) __align__(x)
+#define __cudart_builtin__
+#define __device_builtin__
+#define __forceinline__ __inline__ __attribute__((always_inline))
+
+#define CUDARTAPI
+#define _CRTIMP
+
+// Texture and surface types are not supported yet.
+#define __device_builtin_surface_type__
+#define __device_builtin_texture_type__
+
+// Include support for built-in variables.
+#include "cuda_builtin_vars.h"
+
+// CUDA headers were implemented with the assumption of split-mode
+// compilation and present CUDA functions differently for host and
+// device mode. Typically in host mode they provide declarations with
+// __device__ attribute attached. In device mode we get definitions
+// but *without* __device__ attribute. This does not work well in
+// combined compilation mode used by clang, so we have to trick CUDA
+// headers into something we can use.
+
+// libdevice functions in device_functions_decls.h either come with
+// __host__ __device__ attributes or with none at all. Temporarily
+// undefine __host__ so only __device__ is applied.
+#pragma push_macro("__CUDACC_RTC__")
+#pragma push_macro("__host__")
+#define __CUDACC_RTC__
+#define __host__
+#include "device_functions_decls.h"
+#pragma pop_macro("__host__")
+#pragma pop_macro("__CUDACC_RTC__")
+
+#include "cuda_runtime.h"
+#include "crt/device_runtime.h"
+
+#if defined(__CUDA_ARCH__)
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.h"
+#include "math_functions.h"
+#pragma pop_macro("__forceinline__")
+#else
+#include "device_functions.h"
+#include "math_functions.h"
+#endif
+
+#if defined(__CUDA_ARCH__)
+// Definitions for device specific functions are provided only if
+// __CUDACC__ is defined. Alas, they've already been transitively
+// included by device_functions.h and are now behind include guards.
+// We need to temporarily define __CUDACC__, undo include guards and
+// include the files with implementation of these functions.
+
+#pragma push_macro("__CUDACC__")
+#define __CUDACC__ 1
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+
+#undef __SM_20_ATOMIC_FUNCTIONS_HPP__
+#include "sm_20_atomic_functions.hpp"
+#undef __SM_32_ATOMIC_FUNCTIONS_HPP__
+#include "sm_32_atomic_functions.hpp"
+
+#undef __SM_20_INTRINSICS_HPP__
+#include "sm_20_intrinsics.hpp"
+#undef __SM_30_INTRINSICS_HPP__
+#include "sm_30_intrinsics.hpp"
+#undef __SM_32_INTRINSICS_HPP__
+#include "sm_32_intrinsics.hpp"
+
+#pragma pop_macro("__CUDACC__")
+#endif // __CUDA_ARCH__
+#endif // __PTX__
+#endif // __CLANG_CUDA_SUPPORT_H__