Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -41,6 +41,8 @@
   enum ActionClass {
     InputClass = 0,
     BindArchClass,
+    CudaDeviceClass,
+    CudaHostClass,
     PreprocessJobClass,
     PrecompileJobClass,
     AnalyzeJobClass,
@@ -133,6 +135,39 @@
   }
 };
 
+class CudaDeviceAction : public Action {
+  virtual void anchor();
+  /// GPU architecture to bind -- e.g. sm_35
+  const char *GpuArchName;
+  bool AtTopLevel;
+
+public:
+  CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName,
+                   bool AtTopLevel);
+
+  const char *getGpuArchName() const { return GpuArchName; }
+  bool isAtTopLevel() const { return AtTopLevel; }
+
+  static bool classof(const Action *A) {
+    return A->getKind() == CudaDeviceClass;
+  }
+};
+
+class CudaHostAction : public Action {
+  virtual void anchor();
+  ActionList DeviceActions;
+
+public:
+  CudaHostAction(std::unique_ptr<Action> Input,
+                 const ActionList &DeviceActions);
+  ~CudaHostAction() override;
+
+  ActionList &getDeviceActions() { return DeviceActions; }
+  const ActionList &getDeviceActions() const { return DeviceActions; }
+
+  static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+};
+
 class JobAction : public Action {
   virtual void anchor();
 protected:
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -602,6 +602,8 @@
 // CUDA Options
 //===----------------------------------------------------------------------===//
 
+def cuda_include_gpucode : Separate<["-"], "cuda-include-gpucode">,
+  HelpText<"Incorporate CUDA device-side code.">;
 def fcuda_is_device : Flag<["-"], "fcuda-is-device">,
   HelpText<"Generate code for CUDA device">;
 def fcuda_allow_host_calls_from_host_device : Flag<["-"],
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -409,6 +409,9 @@
   ///
   /// Will cache ToolChains for the life of the driver object, and create them
   /// on-demand.
+  const ToolChain &getTargetToolChain(const llvm::opt::ArgList &Args,
+                                      llvm::Triple &Target) const;
+
   const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
                                 StringRef DarwinArchName = "") const;
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -453,6 +453,10 @@
   Group<f_Group>;
 def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">,
   Group<f_Group>, Flags<[NoArgumentUnused]>;
 def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
+def fcuda_device_only : Flag<["-"], "fcuda-device-only">,
+  HelpText<"Do device-side CUDA compilation only">;
+def fcuda_host_only : Flag<["-"], "fcuda-host-only">,
+  HelpText<"Do host-side CUDA compilation only">;
 def fcxx_exceptions: Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
   HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
 def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
@@ -1067,6 +1071,8 @@
 def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
 def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>;
 def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
+def gpu_architecture : Separate<["-"], "gpu-architecture">,
+  Flags<[DriverOption, HelpHidden]>, HelpText<"CUDA GPU architecture">;
 def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
 def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
   HelpText<"Display available options">;
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -44,6 +44,7 @@
 TYPE("cl",                       CL,           PP_C,         "cl",    "u")
 TYPE("cuda-cpp-output",          PP_CUDA,      INVALID,      "cui",   "u")
 TYPE("cuda",                     CUDA,         PP_CUDA,      "cu",    "u")
+TYPE("cuda",                     CUDA_DEVICE,  PP_CUDA,      "cu",    "")
 TYPE("objective-c-cpp-output",   PP_ObjC,      INVALID,      "mi",    "u")
 TYPE("objc-cpp-output",          PP_ObjC_Alias, INVALID,     "mi",    "u")
 TYPE("objective-c",              ObjC,         PP_ObjC,      "m",     "u")
Index: include/clang/Frontend/CodeGenOptions.h
===================================================================
--- include/clang/Frontend/CodeGenOptions.h
+++ include/clang/Frontend/CodeGenOptions.h
@@ -160,6 +160,11 @@
   /// Name of the profile file to use as input for -fprofile-instr-use
   std::string InstrProfileInput;
 
+  /// A list of file names passed with -cuda-include-gpucode options, forwarded
+  /// to the CUDA runtime back-end so the files get incorporated into the
+  /// host-side object file.
+  std::vector<std::string> CudaGpuCodeFileNames;
+
   /// Regular expression to select optimizations for which we should enable
   /// optimization remarks.
 /// Transformation passes whose name matches this
 /// expression (and support this feature), will emit a diagnostic
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Verifier.h"
 #include <vector>
 
 using namespace clang;
@@ -30,29 +31,61 @@
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
 private:
-  llvm::Type *IntTy, *SizeTy;
-  llvm::PointerType *CharPtrTy, *VoidPtrTy;
+  llvm::Type *IntTy, *SizeTy, *VoidTy;
+  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
+
+  /// Convenience reference to LLVM Context
+  llvm::LLVMContext &VMContext;
+  /// Convenience reference to the current module
+  llvm::Module &TheModule;
+  llvm::SmallVector<llvm::GlobalVariable *, 16> FatbinHandles;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
 
+  /// Creates a function to register all kernel stubs generated in this module.
+  llvm::Function *makeRegisterKernelsFn();
+
+  /// Helper function that generates a constant string and returns a pointer to
+  /// the start of the string.  The result of this function can be used anywhere
+  /// where the C code specifies const char*.
+  llvm::Constant *MakeConstantString(const std::string &Str,
+                                     const std::string &Name = "",
+                                     unsigned Alignment = 0) {
+    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
+    llvm::Constant *ConstStr =
+        CGM.GetAddrOfConstantCString(Str, Name.c_str(), Alignment);
+    return llvm::ConstantExpr::getGetElementPtr(ConstStr, Zeros);
+  }
+
+  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
-  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  /// Creates module constructor function
+  llvm::Function *ModuleCtorFunction() override;
+  /// Creates module destructor function
+  llvm::Function *ModuleDtorFunction() override;
 };
 
 }
 
-CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM) {
+CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
+    : CGCUDARuntime(CGM), VMContext(CGM.getLLVMContext()),
+      TheModule(CGM.getModule()) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
   IntTy = Types.ConvertType(Ctx.IntTy);
   SizeTy = Types.ConvertType(Ctx.getSizeType());
+  VoidTy = llvm::Type::getVoidTy(VMContext);
 
   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
+  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
+
 }
 
 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -68,11 +101,8 @@
 
 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
   // cudaError_t cudaLaunch(char *)
-  std::vector<llvm::Type*> Params;
-  Params.push_back(CharPtrTy);
-  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
-                                                           Params, false),
-                                   "cudaLaunch");
+  return CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
 }
 
 void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
@@ -87,8 +117,7 @@
     assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
     ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
   }
-  llvm::StructType *ArgStackTy = llvm::StructType::get(
-      CGF.getLLVMContext(), ArgTypes);
+  llvm::StructType *ArgStackTy = llvm::StructType::get(VMContext, ArgTypes);
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
@@ -120,6 +149,158 @@
 
   CGF.EmitBlock(EndBlock);
 }
 
+/// Creates internal function to register all kernel stubs generated in this
+/// module with the CUDA runtime.
+/// \code
+/// void .cuda_register_kernels(void** GpuBlobHandle) {
+///   __cudaRegisterFunction(GpuBlobHandle,Kernel0,...);
+///   ...
+///   __cudaRegisterFunction(GpuBlobHandle,KernelM,...);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_register_kernels", &TheModule);
+  llvm::BasicBlock *EntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", RegisterKernelsFunc);
+  CGBuilderTy Builder(VMContext);
+  Builder.SetInsertPoint(EntryBB);
+
+  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
+  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
+  std::vector<llvm::Type *> RegisterFuncParams = {
+      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
+      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
+  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
+      "__cudaRegisterFunction");
+
+  llvm::Argument &BlobHandlePtr = *RegisterKernelsFunc->arg_begin();
+  for (llvm::Function *Kernel : EmittedKernels) {
+    llvm::Constant *KernelName = MakeConstantString(Kernel->getName());
+    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
+
+    llvm::Value *args[] = {
+        &BlobHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), KernelName,
+        KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, NullPtr,
+        NullPtr, NullPtr,
+        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
+
+    Builder.CreateCall(RegisterFunc, args);
+  }
+
+  Builder.CreateRetVoid();
+
+  llvm::verifyFunction(*RegisterKernelsFunc);
+  return RegisterKernelsFunc;
+}
+
+/// Creates a global constructor function for the module:
+/// \code
+/// void .cuda_module_ctor(void*) {
+///   Handle0 = __cudaRegisterFatBinary(GpuCodeBlob0);
+///   .cuda_register_kernels(Handle0);
+///   ...
+///   HandleN = __cudaRegisterFatBinary(GpuCodeBlobN);
+///   .cuda_register_kernels(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::ModuleCtorFunction() {
+  // void .cuda_register_kernels(void* handle);
+  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  // void ** __cudaRegisterFatBinary(void *);
+  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
+      "__cudaRegisterFatBinary");
+  // struct { int magic; int version; void *gpu_blob; void *dont_care; };
+  llvm::StructType *FatbinWrapperTy =
+      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
+
+  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_module_ctor", &TheModule);
+  llvm::BasicBlock *CtorEntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", ModuleCtorFunc);
+  CGBuilderTy CtorBuilder(VMContext);
+
+  CtorBuilder.SetInsertPoint(CtorEntryBB);
+
+  for (const std::string &GpuCodeFileName :
+       CGM.getCodeGenOpts().CudaGpuCodeFileNames) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CodeOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(GpuCodeFileName);
+    if (std::error_code EC = CodeOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuCodeFileName
+                                                        << EC.message();
+      continue;
+    }
+
+    // Create initialized wrapper structure that points to the loaded GPU blob.
+    llvm::Constant *Values[] = {
+        llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
+        llvm::ConstantInt::get(IntTy, 1),          // Fatbin version.
+        MakeConstantString(CodeOrErr.get()->getBuffer(), "", 16), // Data.
+        llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
+    llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
+        TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantStruct::get(FatbinWrapperTy, Values),
+        ".cuda_fatbin_wrapper");
+    FatbinWrapper->setAlignment(8);
+
+    // FatbinHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+        RegisterFatbinFunc,
+        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+    llvm::GlobalVariable *FatbinHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy), ".cuda_fatbin_handle");
+    CtorBuilder.CreateStore(RegisterFatbinCall, FatbinHandle, false);
+
+    // Call .cuda_register_kernels(FatbinHandle);
+    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+
+    // Save FatbinHandle so we can unregister it in the destructor.
+    FatbinHandles.push_back(FatbinHandle);
+  }
+  CtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleCtorFunc);
+  return ModuleCtorFunc;
+}
+
+/// Creates a global destructor function that unregisters all GPU code blobs
+/// registered by the constructor.
+/// \code
+/// void .cuda_module_dtor(void*) {
+///   __cudaUnregisterFatBinary(Handle0);
+///   ...
+///   __cudaUnregisterFatBinary(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::ModuleDtorFunction() {
+  // void __cudaUnregisterFatBinary(void ** handle);
+  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      "__cudaUnregisterFatBinary");
+
+  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_module_dtor", &TheModule);
+  llvm::BasicBlock *DtorEntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", ModuleDtorFunc);
+  CGBuilderTy DtorBuilder(VMContext);
+  DtorBuilder.SetInsertPoint(DtorEntryBB);
+
+  for (llvm::GlobalVariable *FatbinHandle : FatbinHandles) {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc,
+                           DtorBuilder.CreateLoad(FatbinHandle, false));
+  }
+
+  DtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleDtorFunc);
+  return ModuleDtorFunc;
+}
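+
+// Illustration only, not part of the patch: for a module with one GPU code
+// blob and one kernel, the constructor emitted above is expected to look
+// roughly like the following IR sketch (names and exact types may differ;
+// 1180844977 is the 0x466243b1 magic above in decimal):
+//
+//   @.cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* }
+//       { i32 1180844977, i32 1, i8* <blob>, i8* null }, align 8
+//   @.cuda_fatbin_handle = internal global i8** null
+//
+//   define internal void @.cuda_module_ctor(i8*) {
+//     %h = call i8** @__cudaRegisterFatBinary(i8* bitcast (...))
+//     store i8** %h, i8*** @.cuda_fatbin_handle
+//     call void @.cuda_register_kernels(i8** %h)
+//     ret void
+//   }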
 
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
   return new CGNVCUDARuntime(CGM);
 }
Index: lib/CodeGen/CGCUDARuntime.h
===================================================================
--- lib/CodeGen/CGCUDARuntime.h
+++ lib/CodeGen/CGCUDARuntime.h
@@ -16,6 +16,13 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+class Function;
+class GlobalVariable;
+}
+
 namespace clang {
 
 class CUDAKernelCallExpr;
@@ -32,6 +39,8 @@
 protected:
   CodeGenModule &CGM;
 
+  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+
 public:
   CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
   virtual ~CGCUDARuntime();
@@ -39,10 +48,23 @@
   virtual RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
                                         const CUDAKernelCallExpr *E,
                                         ReturnValueSlot ReturnValue);
-
+
+  /// Adds CGF.CurFn to EmittedKernels and calls EmitDeviceStubBody() to emit a
+  /// kernel launch stub.
+  virtual void EmitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args);
+
+  /// Constructs and returns a module initialization function or nullptr if it's
+  /// not needed. Must be called after all kernels have been emitted.
+  virtual llvm::Function *ModuleCtorFunction() = 0;
+
+  /// Returns a module cleanup function or nullptr if it's not needed.
+  /// Must be called after ModuleCtorFunction().
+  virtual llvm::Function *ModuleDtorFunction() = 0;
+
+private:
+  /// Emits kernel launch stub.
   virtual void EmitDeviceStubBody(CodeGenFunction &CGF,
                                   FunctionArgList &Args) = 0;
-
 };
 
 /// Creates an instance of a CUDA runtime class.
Index: lib/CodeGen/CGCUDARuntime.cpp
===================================================================
--- lib/CodeGen/CGCUDARuntime.cpp
+++ lib/CodeGen/CGCUDARuntime.cpp
@@ -53,3 +53,9 @@
 
   return RValue::get(nullptr);
 }
+
+void CGCUDARuntime::EmitDeviceStub(CodeGenFunction &CGF,
+                                   FunctionArgList &Args) {
+  EmittedKernels.push_back(CGF.CurFn);
+  EmitDeviceStubBody(CGF, Args);
+}
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -858,7 +858,7 @@
   else if (getLangOpts().CUDA &&
            !getLangOpts().CUDAIsDevice &&
            FD->hasAttr<CUDAGlobalAttr>())
-    CGM.getCUDARuntime().EmitDeviceStubBody(*this, Args);
+    CGM.getCUDARuntime().EmitDeviceStub(*this, Args);
   else if (isa<CXXMethodDecl>(FD) &&
            cast<CXXMethodDecl>(FD)->isLambdaToBlockPointerConversion()) {
     // The lambda conversion to block pointer is special; the semantics can't be
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -350,6 +350,13 @@
   if (ObjCRuntime)
     if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
       AddGlobalCtor(ObjCInitFunction);
+  if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
+      CUDARuntime) {
+    if (llvm::Function *CudaCtorFunction = CUDARuntime->ModuleCtorFunction())
+      AddGlobalCtor(CudaCtorFunction);
+    if (llvm::Function *CudaDtorFunction = CUDARuntime->ModuleDtorFunction())
+      AddGlobalDtor(CudaDtorFunction);
+  }
   if (PGOReader && PGOStats.hasDiagnostics())
     PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
   EmitCtorList(GlobalCtors, "llvm.global_ctors");
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -24,6 +24,8 @@
   switch (AC) {
   case InputClass: return "input";
   case BindArchClass: return "bind-arch";
+  case CudaDeviceClass: return "cuda-device";
+  case CudaHostClass: return "cuda-host";
   case PreprocessJobClass: return "preprocessor";
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
@@ -53,6 +55,25 @@
                                const char *_ArchName)
     : Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {}
 
+void CudaDeviceAction::anchor() {}
+
+CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
+                                   const char *ArchName, bool AtTopLevel)
+    : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName),
+      AtTopLevel(AtTopLevel) {}
+
+void CudaHostAction::anchor() {}
+
+CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
+                               const ActionList &_DeviceActions)
+    : Action(CudaHostClass, std::move(Input)), DeviceActions(_DeviceActions) {}
+
+CudaHostAction::~CudaHostAction() {
+  for (iterator it = DeviceActions.begin(), ie = DeviceActions.end(); it != ie;
+       ++it)
+    delete *it;
+}
+
 void JobAction::anchor() {}
 
 JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input,
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -180,10 +180,11 @@
   } else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) {
     FinalPhase = phases::Backend;
 
-    // -c only runs up to the assembler.
-  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
+    // -c and partial CUDA compilations only run up to the assembler.
+  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_fcuda_device_only)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_fcuda_host_only))) {
     FinalPhase = phases::Assemble;
-
     // Otherwise do everything.
   } else
     FinalPhase = phases::Link;
@@ -819,7 +820,25 @@
 }
 
 static unsigned PrintActions1(const Compilation &C, Action *A,
-                              std::map<Action *, unsigned> &Ids) {
+                              std::map<Action *, unsigned> &Ids);
+
+static std::string PrintActionList(const Compilation &C, ActionList &AL,
+                                   std::map<Action *, unsigned> &Ids) {
+  std::string str;
+  llvm::raw_string_ostream os(str);
+  os << "{";
+  for (Action::iterator it = AL.begin(), ie = AL.end(); it != ie;) {
+    os << PrintActions1(C, *it, Ids);
+    ++it;
+    if (it != ie)
+      os << ", ";
+  }
+  os << "}";
+  return str;
+}
+
+static unsigned PrintActions1(const Compilation &C, Action *A,
+                              std::map<Action *, unsigned> &Ids) {
   if (Ids.count(A))
     return Ids[A];
@@ -832,15 +851,14 @@
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
+  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    os << '"' << CDA->getGpuArchName() << '"' << ", {"
+       << PrintActions1(C, *CDA->begin(), Ids) << "}";
+  } else if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+    os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
+       << ", gpucode " << PrintActionList(C, CHA->getDeviceActions(), Ids);
   } else {
-    os << "{";
-    for (Action::iterator it = A->begin(), ie = A->end(); it != ie;) {
-      os << PrintActions1(C, *it, Ids);
-      ++it;
-      if (it != ie)
-        os << ", ";
-    }
-    os << "}";
+    os << PrintActionList(C, A->getInputs(), Ids);
   }
 
   unsigned Id = Ids.size();
@@ -1149,6 +1167,77 @@
   }
 }
 
+// For each unique --gpu-architecture argument, this creates a TY_CUDA_DEVICE
+// input action and then wraps each one in a CudaDeviceAction paired with the
+// appropriate GPU arch name. If we're only building device-side code, each
+// action remains independent. Otherwise we pass the device-side actions as
+// inputs to a new CudaHostAction, which combines both host- and device-side
+// actions.
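+//
+// Illustration only, not part of the patch: with the PrintActions1() changes
+// above, a device-side subgraph for a hypothetical --gpu-architecture sm_35
+// compilation is printed as
+//   N: cuda-device, "sm_35", {M}, <type>
+// and the host-side action that incorporates it is printed as
+//   K: cuda-host, {J}, gpucode {N}, <type>
+// where {J} is the regular host pipeline and {N} the wrapped device action(s).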
+static std::unique_ptr<Action>
+BuildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
+                 const Arg *InputArg, const types::ID InputType,
+                 std::unique_ptr<Action> Current, ActionList &Actions) {
+
+  assert(InputType == types::TY_CUDA &&
+         "CUDA Actions only apply to CUDA inputs.");
+
+  SmallVector<const char *, 4> GpuArchList;
+  llvm::StringSet<> GpuArchNames;
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_gpu_architecture)) {
+      A->claim();
+      if (GpuArchNames.insert(A->getValue()).second)
+        GpuArchList.push_back(A->getValue());
+    }
+  }
+
+  if (GpuArchList.empty())
+    GpuArchList.push_back("sm_20");
+
+  Driver::InputList CudaDeviceInputs;
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
+
+  ActionList CudaDeviceActions;
+  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  assert(GpuArchList.size() == CudaDeviceActions.size() &&
+         "Failed to create actions for all devices");
+
+  bool PartialCompilation = false;
+  bool DeviceOnlyCompilation = Args.hasArg(options::OPT_fcuda_device_only);
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) {
+    if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) {
+      PartialCompilation = true;
+      break;
+    }
+  }
+
+  if (PartialCompilation || DeviceOnlyCompilation) {
+    // If -o was specified, we can only work if this is a device-only
+    // compilation for a single device.
+    if (Args.hasArg(options::OPT_o) &&
+        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
+      D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+      return nullptr;
+    }
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      Actions.push_back(new CudaDeviceAction(
+          std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i], true));
+    if (DeviceOnlyCompilation)
+      Current.reset(nullptr);
+    return Current;
+  } else {
+    ActionList CudaDeviceJobActions;
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      CudaDeviceJobActions.push_back(new CudaDeviceAction(
+          std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i],
+          false));
+
+    std::unique_ptr<Action> HostAction(
+        new CudaHostAction(std::move(Current), CudaDeviceJobActions));
+    return HostAction;
+  }
+}
+
 void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
                           const InputList &Inputs, ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
@@ -1251,8 +1340,26 @@
     // Build the pipeline for this file.
     std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
 
-    for (SmallVectorImpl<phases::ID>::iterator
-           i = PL.begin(), e = PL.end(); i != e; ++i) {
+    phases::ID CudaInjectionPhase;
+    if (isSaveTempsEnabled()) {
+      // All phases are done independently; inject GPU blobs during the compile
+      // phase, as that's where we generate the glue code to initialize them.
+      CudaInjectionPhase = phases::Compile;
+    } else {
+      // Assumes that clang does everything up until the linking phase, so we
+      // inject CUDA device actions at the last step before linking. Otherwise
+      // the CUDA host action forces the preprocessor into a separate
+      // invocation.
+      if (FinalPhase == phases::Link) {
+        for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
+          auto next = i + 1;
+          if (next != e && *next == phases::Link)
+            CudaInjectionPhase = *i;
+        }
+      } else
+        CudaInjectionPhase = FinalPhase;
+    }
+
+    for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
+         i != e; ++i) {
       phases::ID Phase = *i;
 
       // We are done if this step is past what the user requested.
@@ -1274,6 +1381,15 @@
       // Otherwise construct the appropriate action.
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
+
+      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
+          !Args.hasArg(options::OPT_fcuda_host_only)) {
+        Current = BuildCudaActions(*this, TC, Args, InputArg, InputType,
+                                   std::move(Current), Actions);
+        if (!Current)
+          break;
+      }
+
       if (Current->getType() == types::TY_Nothing)
         break;
     }
@@ -1403,10 +1519,14 @@
       if (A->getType() != types::TY_Nothing)
         ++NumOutputs;
 
+#if DISABLED_FOR_NOW
+    // TODO: CUDA compilation has more than one input. Need to figure out how
+    // to detect whether it's a CUDA compilation.
     if (NumOutputs > 1) {
       Diag(clang::diag::err_drv_output_argument_with_multiple_files);
       FinalOutput = nullptr;
     }
+#endif
   }
 
   // Collect the list of architectures.
@@ -1521,7 +1641,13 @@
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    JobAction *CompileJA = cast<JobAction>(*Inputs->begin());
+    JobAction *CompileJA;
+    // Extract the real host action, if it's a CudaHostAction.
+    if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
+      CompileJA = cast<JobAction>(*CudaHA->begin());
+    else
+      CompileJA = cast<JobAction>(*Inputs->begin());
+
     const Tool *Compiler = TC->SelectTool(*CompileJA);
     if (!Compiler)
       return nullptr;
@@ -1549,6 +1675,10 @@
   return ToolForJob;
 }
 
+static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
+                                        const ArgList &Args,
+                                        StringRef DarwinArchName);
+
 void Driver::BuildJobsForAction(Compilation &C,
                                 const Action *A,
                                 const ToolChain *TC,
@@ -1559,6 +1689,20 @@
                                 InputInfo &Result) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
+  InputInfoList CudaDeviceInputInfos;
+  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+    InputInfo II;
+    // Append outputs of device jobs to the input list.
+    for (const Action *DA : CHA->getDeviceActions()) {
+      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+                         /*MultipleArchs*/ false, LinkingOutput, II);
+      CudaDeviceInputInfos.push_back(II);
+    }
+    // Override the current action with the real host compile action and
+    // continue processing it.
+    A = *CHA->begin();
+  }
+
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
     // FIXME: It would be nice to not claim this here; maybe the old scheme of
     // just using Args was better?
@@ -1581,8 +1725,21 @@
     else
       TC = &C.getDefaultToolChain();
 
-    BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(),
-                       AtTopLevel, MultipleArchs, LinkingOutput, Result);
+    BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
+                       MultipleArchs, LinkingOutput, Result);
+    return;
+  }
+
+  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    const ToolChain *TC;
+    const char *ArchName = CDA->getGpuArchName();
+    llvm::Triple HostTriple =
+        computeTargetTriple(DefaultTargetTriple, C.getArgs(), "");
+    llvm::Triple TargetTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
+                                                       : "nvptx-nvidia-cuda");
+    TC = &getTargetToolChain(C.getArgs(), TargetTriple);
+    BuildJobsForAction(C, *CDA->begin(), TC, ArchName, CDA->isAtTopLevel(),
+                       /*MultipleArchs*/ true, LinkingOutput, Result);
     return;
   }
 
@@ -1617,6 +1774,10 @@
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
+  // Append outputs of CUDA device jobs to the input list.
+  if (CudaDeviceInputInfos.size())
+    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+
   // Determine the place to write output to, if any.
   if (JA->getType() == types::TY_Nothing)
     Result = InputInfo(A->getType(), BaseInput);
@@ -2022,10 +2183,8 @@
   return Target;
 }
 
-const ToolChain &Driver::getToolChain(const ArgList &Args,
-                                      StringRef DarwinArchName) const {
-  llvm::Triple Target = computeTargetTriple(DefaultTargetTriple, Args,
-                                            DarwinArchName);
+const ToolChain &Driver::getTargetToolChain(const ArgList &Args,
+                                            llvm::Triple &Target) const {
   ToolChain *&TC = ToolChains[Target.str()];
 
   if (!TC) {
@@ -2095,6 +2254,9 @@
         break;
       }
       break;
+    case llvm::Triple::CUDA:
+      TC = new toolchains::Cuda(*this, Target, Args);
+      break;
     default:
       // TCE is an OSless target
      if (Target.getArchName() == "tce") {
@@ -2125,6 +2287,13 @@
   return *TC;
 }
 
+const ToolChain &Driver::getToolChain(const ArgList &Args,
+                                      StringRef DarwinArchName) const {
+  llvm::Triple Target =
+      computeTargetTriple(DefaultTargetTriple, Args, DarwinArchName);
+  return getTargetToolChain(Args, Target);
+}
+
 bool Driver::ShouldUseClangCompiler(const JobAction &JA) const {
   // Check if user requested no clang, or clang doesn't understand this type (we
   // only handle single inputs for now).
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -151,6 +151,8 @@
 
   case Action::InputClass:
   case Action::BindArchClass:
+  case Action::CudaDeviceClass:
+  case Action::CudaHostClass:
   case Action::LipoJobClass:
   case Action::DsymutilJobClass:
   case Action::VerifyDebugInfoJobClass:
Index: lib/Driver/ToolChains.h
===================================================================
--- lib/Driver/ToolChains.h
+++ lib/Driver/ToolChains.h
@@ -690,6 +690,18 @@
   std::string computeSysRoot() const;
 };
 
+class LLVM_LIBRARY_VISIBILITY Cuda : public Linux {
+public:
+  Cuda(const Driver &D, const llvm::Triple &Triple,
+       const llvm::opt::ArgList &Args);
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                const char *BoundArch) const override;
+  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args) const override;
+};
+
 class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux {
 protected:
   GCCVersion GCCLibAndIncVersion;
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -3600,6 +3600,62 @@
   return new tools::dragonfly::Link(*this);
 }
 
+/// Stub for the CUDA toolchain. At the moment we don't have an assembler or
+/// a linker, and we need the toolchain mainly to propagate device-side
+/// options to CC1.
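+///
+/// Illustration only, not part of the patch: TranslateArgs() below filters
+/// -Xarch_<arch> arguments and appends -march=<BoundArch>, so a hypothetical
+///   clang -x cuda --gpu-architecture sm_35 -Xarch_sm_35 -O2 axpy.cu
+/// would be expected to produce a device-side cc1 invocation carrying -O2 and
+/// "-target-cpu" "sm_35" (derived from the synthesized -march=sm_35 by the
+/// nvptx handling added to Tools.cpp below), plus -fcuda-is-device from
+/// addClangTargetOptions(). -Xarch_* arguments for other architectures are
+/// dropped for this BoundArch.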
+
+Cuda::Cuda(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
+    : Linux(D, Triple, Args) {}
+
+void Cuda::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                 llvm::opt::ArgStringList &CC1Args) const {
+  Linux::addClangTargetOptions(DriverArgs, CC1Args);
+  CC1Args.push_back("-fcuda-is-device");
+}
+
+llvm::opt::DerivedArgList *
+Cuda::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                    const char *BoundArch) const {
+  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_Xarch__)) {
+      // Skip this argument unless the architecture matches BoundArch.
+      if (A->getValue(0) != StringRef(BoundArch))
+        continue;
+
+      unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
+      unsigned Prev = Index;
+      std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
+
+      // If the argument parsing failed or more than one argument was
+      // consumed, the -Xarch_ argument's parameter tried to consume
+      // extra arguments. Emit an error and ignore.
+      //
+      // We also want to disallow any options which would alter the
+      // driver behavior; that isn't going to work in our model. We
+      // use isDriverOption() as an approximation, although things
+      // like -O4 are going to slip through.
+      if (!XarchArg || Index > Prev + 1) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
+            << A->getAsString(Args);
+        continue;
+      } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
+            << A->getAsString(Args);
+        continue;
+      }
+      XarchArg->setBaseArg(A);
+      A = XarchArg.release();
+      DAL->AddSynthesizedArg(A);
+    }
+    DAL->append(A);
+  }
+
+  DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  return DAL;
+}
 
 /// XCore tool chain
 XCore::XCore(const Driver &D, const llvm::Triple &Triple,
Index: lib/Driver/Tools.h
===================================================================
--- lib/Driver/Tools.h
+++ lib/Driver/Tools.h
@@ -41,6 +41,8 @@
 public:
   static const char *getBaseInputName(const llvm::opt::ArgList &Args,
                                       const InputInfoList &Inputs);
+  static const char *getBaseInputName(const llvm::opt::ArgList &Args,
+                                      const InputInfo &Input);
   static const char *getBaseInputStem(const llvm::opt::ArgList &Args,
                                       const InputInfoList &Inputs);
   static const char *getDependencyFileName(const llvm::opt::ArgList &Args,
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -1517,6 +1517,12 @@
     return CPUName;
   }
 
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+      return A->getValue();
+    return "";
+
   case llvm::Triple::ppc:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le: {
@@ -2572,8 +2578,30 @@
   bool IsWindowsCygnus =
       getToolChain().getTriple().isWindowsCygwinEnvironment();
   bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
+  bool IsCuda = false;
 
-  assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
+  assert(Inputs.size() >= 1 && "Must have at least one input.");
+  InputInfoList BaseInputs; // Inputs[0]
+  InputInfoList CudaInputs; // Inputs[1...]
+  const InputInfo &Input = Inputs[0];
+  BaseInputs.push_back(Input);
+
+  if (Inputs.size() > 1) {
+    // CUDA compilation mode may pass more than one file. Verify that all
+    // additional files were derived from the same source.
+    IsCuda = true;
+    StringRef BaseInput(Input.getBaseInput());
+    for (const auto &it : Inputs) {
+      if (BaseInput != StringRef(it.getBaseInput())) {
+        IsCuda = false;
+        break;
+      }
+    }
+    if (IsCuda)
+      CudaInputs.append(std::next(Inputs.begin()), Inputs.end());
+  }
+
+  assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
   // Invoke ourselves in -cc1 mode.
   //
@@ -2681,7 +2709,7 @@
   // Set the main file name, so that debug info works even with
   // -save-temps.
   CmdArgs.push_back("-main-file-name");
-  CmdArgs.push_back(getBaseInputName(Args, Inputs));
+  CmdArgs.push_back(getBaseInputName(Args, Input));
 
   // Some flags which affect the language (via preprocessor
   // defines).
@@ -2709,7 +2737,7 @@
 
     CmdArgs.push_back("-analyzer-checker=deadcode");
 
-    if (types::isCXX(Inputs[0].getType()))
+    if (types::isCXX(Input.getType()))
       CmdArgs.push_back("-analyzer-checker=cplusplus");
 
     // Enable the following experimental checkers for testing.
@@ -3237,7 +3265,7 @@
 
   // Explicitly error on some things we know we don't support and can't just
   // ignore.
-  types::ID InputType = Inputs[0].getType();
+  types::ID InputType = Input.getType();
   if (!Args.hasArg(options::OPT_fallow_unsupported)) {
     Arg *Unsupported;
     if (types::isCXX(InputType) &&
@@ -4609,7 +4637,7 @@
     assert(Output.isNothing() && "Invalid output.");
   }
 
-  for (const auto &II : Inputs) {
+  for (const auto &II : BaseInputs) {
     addDashXForInput(Args, II, CmdArgs);
 
     if (II.isFilename())
@@ -4650,16 +4678,26 @@
   const char *SplitDwarfOut;
   if (SplitDwarf) {
     CmdArgs.push_back("-split-dwarf-file");
-    SplitDwarfOut = SplitDebugName(Args, Inputs);
+    SplitDwarfOut = SplitDebugName(Args, BaseInputs);
     CmdArgs.push_back(SplitDwarfOut);
   }
 
+  // Include device-side CUDA code.
+  if (IsCuda) {
+    for (InputInfoList::const_iterator it = CudaInputs.begin(),
+                                       ie = CudaInputs.end();
+         it != ie; ++it) {
+      CmdArgs.push_back("-cuda-include-gpucode");
+      CmdArgs.push_back(it->getFilename());
+    }
+  }
+
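+  // Illustration only, not part of the patch: with two --gpu-architecture
+  // values, the loop above is expected to leave the host-side cc1 invocation
+  // with one flag pair per device-side output, e.g. (hypothetical temp names):
+  //   -cuda-include-gpucode /tmp/axpy-sm_35.s -cuda-include-gpucode /tmp/axpy-sm_30.s
+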
   // Finally add the compile command to the compilation.
   if (Args.hasArg(options::OPT__SLASH_fallback) &&
       Output.getType() == types::TY_Object &&
       (InputType == types::TY_C || InputType == types::TY_CXX)) {
-    auto CLCommand =
-        getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput);
+    auto CLCommand = getCLFallback()->GetCommand(C, JA, Output, BaseInputs,
+                                                 Args, LinkingOutput);
     C.addCommand(llvm::make_unique<FallbackCommand>(JA, *this, Exec, CmdArgs,
                                                     std::move(CLCommand)));
   } else {
@@ -5717,9 +5755,13 @@
 }
 
 const char *Clang::getBaseInputName(const ArgList &Args,
+                                    const InputInfo &Input) {
+  return Args.MakeArgString(llvm::sys::path::filename(Input.getBaseInput()));
+}
+
+const char *Clang::getBaseInputName(const ArgList &Args,
                                     const InputInfoList &Inputs) {
-  return Args.MakeArgString(
-      llvm::sys::path::filename(Inputs[0].getBaseInput()));
+  return getBaseInputName(Args, Inputs[0]);
 }
 
 const char *Clang::getBaseInputStem(const ArgList &Args,
Index: lib/Driver/Types.cpp
===================================================================
--- lib/Driver/Types.cpp
+++ lib/Driver/Types.cpp
@@ -86,6 +86,7 @@
   case TY_C: case TY_PP_C:
   case TY_CL:
   case TY_CUDA: case TY_PP_CUDA:
+  case TY_CUDA_DEVICE:
   case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
   case TY_CXX: case TY_PP_CXX:
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
@@ -122,7 +123,7 @@
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
   case TY_CXXHeader: case TY_PP_CXXHeader:
   case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
-  case TY_CUDA: case TY_PP_CUDA:
+  case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
     return true;
   }
 }
@@ -206,10 +207,12 @@
         P.push_back(phases::Compile);
         P.push_back(phases::Backend);
       }
-      P.push_back(phases::Assemble);
+      if (Id != TY_CUDA_DEVICE)
+        P.push_back(phases::Assemble);
     }
   }
-  if (!onlyPrecompileType(Id)) {
+
+  if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
    P.push_back(phases::Link);
   }
   assert(0 < P.size() && "Not enough phases in list");
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -639,6 +639,8 @@
       Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
       Opts.SanitizeRecover);
 
+  Opts.CudaGpuCodeFileNames = Args.getAllArgValues(OPT_cuda_include_gpucode);
+
   return Success;
 }
Index: test/Driver/cuda-options.cu
===================================================================
--- /dev/null
+++ test/Driver/cuda-options.cu
@@ -0,0 +1,108 @@
+// Tests CUDA compilation pipeline construction in Driver.
+
+// Simple compilation case:
+// RUN: %clang -### -nocudainc -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Typical compilation + link case:
+// RUN: %clang -### -nocudainc %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Then link things.
+// RUN:   -check-prefix CUDA-L %s
+
+// Verify that -fcuda-host-only disables device-side compilation and linking.
+// RUN: %clang -### -nocudainc -fcuda-host-only %s 2>&1 \
+// Make sure we didn't run device-side compilation.
+// RUN:   | FileCheck -check-prefix CUDA-ND \
+// Then compile host side and make sure we don't attempt to incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-NI \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that -fcuda-device-only disables host-side compilation and linking.
+// RUN: %clang -### -nocudainc -fcuda-device-only %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Make sure there is no host compilation or linking.
+// RUN:   -check-prefix CUDA-NH -check-prefix CUDA-NL %s
+
+// Verify that with -S we compile host and device sides to assembly
+// and incorporate device code on the host side.
+// RUN: %clang -### -nocudainc -S -c %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that the --gpu-architecture option passes the correct GPU
+// architecture info to device compilation.
+// RUN: %clang -### -nocudainc -gpu-architecture sm_35 -c %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// Then compile host side and incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that there is one device-side compilation per --gpu-architecture arg
+// and that all results are included on the host side.
+// RUN: %clang -### -nocudainc -gpu-architecture sm_35 -gpu-architecture sm_30 -c %s 2>&1 \
+// Compile both device sides to PTX assembly.
+// RUN:   | FileCheck \
+// RUN:   -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN:   -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
+// Then compile host side and incorporate both device-side outputs.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Match device-side compilation.
+// CUDA-D1: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D1-SAME: "-fcuda-is-device"
+// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
+// CUDA-D1-SAME: "-o" "[[GPUCODE1:[^"]*]]"
+// CUDA-D1-SAME: "-x" "cuda"
+
+// Match another device-side compilation.
+// CUDA-D2: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D2-SAME: "-fcuda-is-device"
+// CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
+// CUDA-D2-SAME: "-o" "[[GPUCODE2:[^"]*]]"
+// CUDA-D2-SAME: "-x" "cuda"
+
+// Match no device-side compilation.
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-SAME-NOT: "-fcuda-is-device"
+
+// Match host-side compilation.
+// CUDA-H: "-cc1" "-triple"
+// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "-fcuda-is-device"
+// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
+// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-I1-SAME: "-cuda-include-gpucode" "[[GPUCODE1]]"
+// CUDA-H-I2-SAME: "-cuda-include-gpucode" "[[GPUCODE2]]"
+
+// Match no GPU code inclusion.
+// CUDA-H-NI-NOT: "-cuda-include-gpucode"
+
+// Match no CUDA compilation.
+// CUDA-NH-NOT: "-cc1" "-triple"
+// CUDA-NH-SAME-NOT: "-x" "cuda"
+
+// Match linker.
+// CUDA-L: "{{.*}}ld{{(.exe)?}}"
+// CUDA-L-SAME: "[[HOSTOBJ]]"
+
+// Match no linker.
+// CUDA-NL-NOT: "{{.*}}ld{{(.exe)?}}"
Index: test/Index/attributes-cuda.cu
===================================================================
--- test/Index/attributes-cuda.cu
+++ test/Index/attributes-cuda.cu
@@ -1,5 +1,5 @@
-// RUN: c-index-test -test-load-source all -x cuda %s | FileCheck %s
-
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-host-only %s | FileCheck %s
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-device-only %s | FileCheck %s
 __attribute__((device)) void f_device();
 __attribute__((global)) void f_global();
 __attribute__((constant)) int* g_constant;
Index: tools/libclang/CIndex.cpp
===================================================================
--- tools/libclang/CIndex.cpp
+++ tools/libclang/CIndex.cpp
@@ -2998,6 +2998,11 @@
       /*AllowPCHWithCompilerErrors=*/true, SkipFunctionBodies,
       /*UserFilesAreVolatile=*/true, ForSerialization, &ErrUnit));
 
+  if (!Unit && !ErrUnit) {
+    PTUI->result = CXError_ASTReadError;
+    return;
+  }
+
   if (NumErrors != Diags->getClient()->getNumErrors()) {
     // Make sure to check that 'Unit' is non-NULL.
     if (CXXIdx->getDisplayDiagnostics())
Index: unittests/ASTMatchers/ASTMatchersTest.h
===================================================================
--- unittests/ASTMatchers/ASTMatchersTest.h
+++ unittests/ASTMatchers/ASTMatchersTest.h
@@ -163,6 +163,7 @@
   std::vector<std::string> Args;
   Args.push_back("-xcuda");
   Args.push_back("-fno-ms-extensions");
+  Args.push_back("-fcuda-host-only");
   Args.push_back(CompileArg);
   if (!runToolOnCodeWithArgs(Factory->create(), CudaHeader + Code, Args)) {