Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -3812,8 +3812,11 @@
   if (DriverArgs.hasArg(options::OPT_nocudainc))
     return;
 
-  if (CudaInstallation.isValid())
+  if (CudaInstallation.isValid()) {
     addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
+    CC1Args.push_back("-include");
+    CC1Args.push_back("cuda_runtime.h");
+  }
 }
 
 bool Linux::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); }
Index: lib/Headers/CMakeLists.txt
===================================================================
--- lib/Headers/CMakeLists.txt
+++ lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   bmiintrin.h
   cpuid.h
   cuda_builtin_vars.h
+  cuda_runtime.h
   emmintrin.h
   f16cintrin.h
   float.h
Index: lib/Headers/cuda_runtime.h
===================================================================
--- /dev/null
+++ lib/Headers/cuda_runtime.h
@@ -0,0 +1,155 @@
+/*===---- cuda_runtime.h - CUDA runtime support ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_H__
+#define __CLANG_CUDA_RUNTIME_H__
+
+#if defined(__PTX__)
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific of
+// implementation of CUDA-7.x headers and are expected to break with
+// any other version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+#include "host_config.h"
+#include "host_defines.h"
+#include "driver_types.h"
+#include "common_functions.h"
+
+#undef __CUDABE__
+#define __CUDACC__
+#include_next "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+#include "crt/host_runtime.h"
+#include "crt/device_runtime.h"
+
+// We need decls for functions in CUDA's libdevice woth __device__
+// attribute only. Alas they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDA_RTC__
+// which produces HD variant and undef __host__ which gives us desided
+// decls with __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+#include "math_functions.hpp"
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// For some reason single-argument variant is not always declared by
+// CUDA headers. Alas, device_functions.hpp included below needs it.
+static inline __device__ void __brkpt(int c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of thins get declared/defined with __host__ attribute which
+// we don't want and we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_functions.hpp"
+#include "device_atomic_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_32_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+// sm_30_intrinsics.h has declarations that use default argument, so
+// we have to include it and it will in turn include .hpp
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.hpp"
+#undef __MATH_FUNCTIONS_HPP__
+#include "math_functions.hpp"
+#pragma pop_macro("__host__")
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+#define __NVCC__
+
+#if defined(__CUDA_ARCH__)
+// We need to emit IR declaration for non-existing __nvvm_reflect to
+// let backend know that it should be treated as const nothrow
+// function which is implicitly assumed by NVVMReflect pass.
+extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
+static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
+  return __nvvm_reflect("NONE");
+}
+#endif
+
+#endif // __PTX__
+#endif // __CLANG_CUDA_RUNTIME_H__
Index: test/Driver/cuda-detect.cu
===================================================================
--- test/Driver/cuda-detect.cu
+++ test/Driver/cuda-detect.cu
@@ -3,7 +3,7 @@
 //
 // # Check that we properly detect CUDA installation.
 // RUN: %clang -v --target=i386-unknown-linux \
-// RUN:   --sysroot=/tmp/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
 // RUN: %clang -v --target=i386-unknown-linux \
@@ -32,6 +32,11 @@
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix NOLIBDEVICE
+// Verify that we don't add include paths, link with libdevice or
+// -include cuda_runtime without valid CUDA installation.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
+// RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE
 
 // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda
 // NOCUDA-NOT: Found CUDA installation:
@@ -43,7 +48,9 @@
 // LIBDEVICE35-SAME: libdevice.compute_35.10.bc
 // LIBDEVICE-SAME: "-target-feature" "+ptx42"
 // CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include"
+// CUDAINC-SAME: "-include" "cuda_runtime.h"
 // NOCUDAINC-NOT: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include"
+// NOCUDAINC-NOT: "-include" "cuda_runtime.h"
 // LIBDEVICE-SAME: "-x" "cuda"
 
 // NOLIBDEVICE: "-triple" "nvptx-nvidia-cuda"
Index: unittests/ASTMatchers/ASTMatchersTest.h
===================================================================
--- unittests/ASTMatchers/ASTMatchersTest.h
+++ unittests/ASTMatchers/ASTMatchersTest.h
@@ -178,6 +178,7 @@
   Args.push_back("-xcuda");
   Args.push_back("-fno-ms-extensions");
   Args.push_back("--cuda-host-only");
+  Args.push_back("-nocudainc");
   Args.push_back(CompileArg);
   if (!runToolOnCodeWithArgs(Factory->create(),
                              CudaHeader + Code, Args)) {