Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -41,6 +41,8 @@
   enum ActionClass {
     InputClass = 0,
     BindArchClass,
+    CudaDeviceClass,
+    CudaHostClass,
     PreprocessJobClass,
     PrecompileJobClass,
     AnalyzeJobClass,
@@ -133,6 +135,39 @@
   }
 };
 
+class CudaDeviceAction : public Action {
+  virtual void anchor();
+  /// GPU architecture to bind -- e.g. sm_35
+  const char *GpuArchName;
+  bool AtTopLevel;
+
+public:
+  CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName,
+                   bool AtTopLevel);
+
+  const char *getGpuArchName() const { return GpuArchName; }
+  bool isAtTopLevel() const { return AtTopLevel; }
+
+  static bool classof(const Action *A) {
+    return A->getKind() == CudaDeviceClass;
+  }
+};
+
+class CudaHostAction : public Action {
+  virtual void anchor();
+  ActionList DeviceActions;
+
+public:
+  CudaHostAction(std::unique_ptr<Action> Input,
+                 const ActionList &DeviceActions);
+  ~CudaHostAction() override;
+
+  ActionList &getDeviceActions() { return DeviceActions; }
+  const ActionList &getDeviceActions() const { return DeviceActions; }
+
+  static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+};
+
 class JobAction : public Action {
   virtual void anchor();
 protected:
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -602,6 +602,8 @@
 // CUDA Options
 //===----------------------------------------------------------------------===//
 
+def cuda_include_gpucode : Separate<["-"], "cuda-include-gpucode">,
+  HelpText<"Incorporate CUDA device-side code.">;
 def fcuda_is_device : Flag<["-"], "fcuda-is-device">,
   HelpText<"Generate code for CUDA device">;
 def fcuda_allow_host_calls_from_host_device : Flag<["-"],
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -409,6 +409,9 @@
   ///
   /// Will cache ToolChains for the life of the driver object, and create them
   /// on-demand.
+  const ToolChain &getTargetToolChain(const llvm::opt::ArgList &Args,
+                                      llvm::Triple &Target) const;
+
   const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
                                 StringRef DarwinArchName = "") const;
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -453,6 +453,10 @@
   Group<f_Group>;
 def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">,
   Group<f_Group>, Flags<[NoArgumentUnused]>;
 def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
+def fcuda_device_only : Flag<["-"], "fcuda-device-only">,
+  HelpText<"Do device-side CUDA compilation only">;
+def fcuda_host_only : Flag<["-"], "fcuda-host-only">,
+  HelpText<"Do host-side CUDA compilation only">;
 def fcxx_exceptions: Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
   HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
 def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
@@ -1067,6 +1071,8 @@
 def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
 def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>;
 def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
+def gpu_architecture : Separate<["-"], "gpu-architecture">,
+  Flags<[DriverOption, HelpHidden]>, HelpText<"CUDA GPU architecture">;
 def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
 def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
   HelpText<"Display available options">;
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -44,6 +44,7 @@
 TYPE("cl",                       CL,           PP_C,         "cl",    "u")
 TYPE("cuda-cpp-output",          PP_CUDA,      INVALID,      "cui",   "u")
 TYPE("cuda",                     CUDA,         PP_CUDA,      "cu",    "u")
+TYPE("cuda",                     CUDA_DEVICE,  PP_CUDA,      "cu",    "")
 TYPE("objective-c-cpp-output",   PP_ObjC,      INVALID,      "mi",    "u")
 TYPE("objc-cpp-output",          PP_ObjC_Alias, INVALID,     "mi",    "u")
 TYPE("objective-c",              ObjC,         PP_ObjC,      "m",     "u")
Index: include/clang/Frontend/CodeGenOptions.h
===================================================================
--- include/clang/Frontend/CodeGenOptions.h
+++ include/clang/Frontend/CodeGenOptions.h
@@ -160,6 +160,11 @@
   /// Name of the profile file to use as input for -fprofile-instr-use
   std::string InstrProfileInput;
 
+  /// A list of file names passed with -cuda-include-gpucode options, forwarded
+  /// to the CUDA runtime back-end so the files get incorporated into the
+  /// host-side object file.
+  std::vector<std::string> CudaGpuCodeFileNames;
+
   /// Regular expression to select optimizations for which we should enable
   /// optimization remarks.
 /// Transformation passes whose name matches this
 /// expression (and support this feature), will emit a diagnostic
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Verifier.h"
 #include <vector>
 
 using namespace clang;
@@ -30,29 +31,61 @@
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
 private:
-  llvm::Type *IntTy, *SizeTy;
-  llvm::PointerType *CharPtrTy, *VoidPtrTy;
+  llvm::Type *IntTy, *SizeTy, *VoidTy;
+  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
+
+  /// Convenience reference to LLVM Context
+  llvm::LLVMContext &VMContext;
+  /// Convenience reference to the current module
+  llvm::Module &TheModule;
+  llvm::SmallVector<llvm::GlobalVariable *, 16> FatbinHandles;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
 
+  /// Creates a function to register all kernel stubs generated in this module.
+  llvm::Function *makeRegisterKernelsFn();
+
+  /// Helper function that generates a constant string and returns a pointer to
+  /// the start of the string.  The result of this function can be used anywhere
+  /// where the C code specifies const char*.
+  llvm::Constant *MakeConstantString(const std::string &Str,
+                                     const std::string &Name = "",
+                                     unsigned Alignment = 0) {
+    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
+    llvm::Constant *ConstStr =
+        CGM.GetAddrOfConstantCString(Str, Name.c_str(), Alignment);
+    return llvm::ConstantExpr::getGetElementPtr(ConstStr, Zeros);
+  }
+
+  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
-  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  /// Creates module constructor function
+  llvm::Function *ModuleCtorFunction() override;
+  /// Creates module destructor function
+  llvm::Function *ModuleDtorFunction() override;
 };
 
 }
 
-CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM) {
+CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
+    : CGCUDARuntime(CGM), VMContext(CGM.getLLVMContext()),
+      TheModule(CGM.getModule()) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
   IntTy = Types.ConvertType(Ctx.IntTy);
   SizeTy = Types.ConvertType(Ctx.getSizeType());
+  VoidTy = llvm::Type::getVoidTy(VMContext);
 
   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
+  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
+
 }
 
 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -68,11 +101,8 @@
 
 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
   // cudaError_t cudaLaunch(char *)
-  std::vector<llvm::Type*> Params;
-  Params.push_back(CharPtrTy);
-  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
-                                                           Params, false),
-                                   "cudaLaunch");
+  return CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
 }
 
 void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
@@ -87,8 +117,7 @@
     assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
     ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
   }
-  llvm::StructType *ArgStackTy = llvm::StructType::get(
-      CGF.getLLVMContext(), ArgTypes);
+  llvm::StructType *ArgStackTy = llvm::StructType::get(VMContext, ArgTypes);
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
@@ -120,6 +149,158 @@
 
   CGF.EmitBlock(EndBlock);
 }
 
+/// Creates internal function to register all kernel stubs generated in this
+/// module with the CUDA runtime.
+/// \code
+/// void .cuda_register_kernels(void** GpuBlobHandle) {
+///   __cudaRegisterFunction(GpuBlobHandle,Kernel0,...);
+///   ...
+///   __cudaRegisterFunction(GpuBlobHandle,KernelM,...);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_register_kernels", &TheModule);
+  llvm::BasicBlock *EntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", RegisterKernelsFunc);
+  CGBuilderTy Builder(VMContext);
+  Builder.SetInsertPoint(EntryBB);
+
+  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
+  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
+  std::vector<llvm::Type *> RegisterFuncParams = {
+      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
+      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
+  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
+      "__cudaRegisterFunction");
+
+  llvm::Argument &BlobHandlePtr = *RegisterKernelsFunc->arg_begin();
+  for (llvm::Function *Kernel : EmittedKernels) {
+    llvm::Constant *KernelName = MakeConstantString(Kernel->getName());
+    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
+
+    llvm::Value *args[] = {
+        &BlobHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), KernelName,
+        KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, NullPtr,
+        NullPtr, NullPtr,
+        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
+
+    Builder.CreateCall(RegisterFunc, args);
+  }
+
+  Builder.CreateRetVoid();
+
+  llvm::verifyFunction(*RegisterKernelsFunc);
+  return RegisterKernelsFunc;
+}
+
+/// Creates a global constructor function for the module:
+/// \code
+/// void .cuda_module_ctor(void*) {
+///   Handle0 = __cudaRegisterFatBinary(GpuCodeBlob0);
+///   .cuda_register_kernels(Handle0);
+///   ...
+///   HandleN = __cudaRegisterFatBinary(GpuCodeBlobN);
+///   .cuda_register_kernels(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::ModuleCtorFunction() {
+  // void .cuda_register_kernels(void* handle);
+  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  // void ** __cudaRegisterFatBinary(void *);
+  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
+      "__cudaRegisterFatBinary");
+  // struct { int magic; int version; void *gpu_blob; void *dont_care; };
+  llvm::StructType *FatbinWrapperTy =
+      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
+
+  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_module_ctor", &TheModule);
+  llvm::BasicBlock *CtorEntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", ModuleCtorFunc);
+  CGBuilderTy CtorBuilder(VMContext);
+
+  CtorBuilder.SetInsertPoint(CtorEntryBB);
+
+  for (const std::string &GpuCodeFileName :
+       CGM.getCodeGenOpts().CudaGpuCodeFileNames) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CodeOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(GpuCodeFileName);
+    if (std::error_code EC = CodeOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuCodeFileName
+                                                        << EC.message();
+      continue;
+    }
+
+    // Create initialized wrapper structure that points to the loaded GPU blob.
+    llvm::Constant *Values[] = {
+        llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
+        llvm::ConstantInt::get(IntTy, 1),          // Fatbin version.
+        MakeConstantString(CodeOrErr.get()->getBuffer(), "", 16), // Data.
+        llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
+    llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
+        TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantStruct::get(FatbinWrapperTy, Values),
+        ".cuda_fatbin_wrapper");
+    FatbinWrapper->setAlignment(8);
+
+    // FatbinHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+        RegisterFatbinFunc,
+        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+    llvm::GlobalVariable *FatbinHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy), ".cuda_fatbin_handle");
+    CtorBuilder.CreateStore(RegisterFatbinCall, FatbinHandle, false);
+
+    // Call .cuda_register_kernels(FatbinHandle);
+    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+
+    // Save FatbinHandle so we can unregister it in the destructor.
+    FatbinHandles.push_back(FatbinHandle);
+  }
+  CtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleCtorFunc);
+  return ModuleCtorFunc;
+}
+
+/// Creates a global destructor function that unregisters all GPU code blobs
+/// registered by the constructor.
+/// \code
+/// void .cuda_module_dtor(void*) {
+///   __cudaUnregisterFatBinary(Handle0);
+///   ...
+///   __cudaUnregisterFatBinary(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::ModuleDtorFunction() {
+  // void __cudaUnregisterFatBinary(void ** handle);
+  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      "__cudaUnregisterFatBinary");
+
+  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, ".cuda_module_dtor", &TheModule);
+  llvm::BasicBlock *DtorEntryBB =
+      llvm::BasicBlock::Create(VMContext, "entry", ModuleDtorFunc);
+  CGBuilderTy DtorBuilder(VMContext);
+  DtorBuilder.SetInsertPoint(DtorEntryBB);
+
+  for (llvm::GlobalVariable *FatbinHandle : FatbinHandles) {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc,
+                           DtorBuilder.CreateLoad(FatbinHandle, false));
+  }
+
+  DtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleDtorFunc);
+  return ModuleDtorFunc;
+}
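+
+// Illustration only, not part of the patch: for a module with one GPU code
+// blob and one kernel, the constructor emitted above is expected to look
+// roughly like the following IR sketch (names and exact types may differ;
+// 1180844977 is the 0x466243b1 magic above in decimal):
+//
+//   @.cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* }
+//       { i32 1180844977, i32 1, i8* <blob>, i8* null }, align 8
+//   @.cuda_fatbin_handle = internal global i8** null
+//
+//   define internal void @.cuda_module_ctor(i8*) {
+//     %h = call i8** @__cudaRegisterFatBinary(i8* bitcast (...))
+//     store i8** %h, i8*** @.cuda_fatbin_handle
+//     call void @.cuda_register_kernels(i8** %h)
+//     ret void
+//   }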
 
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
   return new CGNVCUDARuntime(CGM);
 }
Index: lib/CodeGen/CGCUDARuntime.h
===================================================================
--- lib/CodeGen/CGCUDARuntime.h
+++ lib/CodeGen/CGCUDARuntime.h
@@ -16,6 +16,13 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+class Function;
+class GlobalVariable;
+}
+
 namespace clang {
 
 class CUDAKernelCallExpr;
@@ -32,6 +39,8 @@
 protected:
   CodeGenModule &CGM;
 
+  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+
 public:
   CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
   virtual ~CGCUDARuntime();
@@ -39,10 +48,23 @@
   virtual RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
                                         const CUDAKernelCallExpr *E,
                                         ReturnValueSlot ReturnValue);
-
+
+  /// Adds CGF.CurFn to EmittedKernels and calls EmitDeviceStubBody() to emit a
+  /// kernel launch stub.
+  virtual void EmitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args);
+
+  /// Constructs and returns a module initialization function or nullptr if it's
+  /// not needed. Must be called after all kernels have been emitted.
+  virtual llvm::Function *ModuleCtorFunction() = 0;
+
+  /// Returns a module cleanup function or nullptr if it's not needed.
+  /// Must be called after ModuleCtorFunction().
+  virtual llvm::Function *ModuleDtorFunction() = 0;
+
+private:
+  /// Emits kernel launch stub.
   virtual void EmitDeviceStubBody(CodeGenFunction &CGF,
                                   FunctionArgList &Args) = 0;
-
 };
 
 /// Creates an instance of a CUDA runtime class.
Index: lib/CodeGen/CGCUDARuntime.cpp
===================================================================
--- lib/CodeGen/CGCUDARuntime.cpp
+++ lib/CodeGen/CGCUDARuntime.cpp
@@ -53,3 +53,9 @@
 
   return RValue::get(nullptr);
 }
+
+void CGCUDARuntime::EmitDeviceStub(CodeGenFunction &CGF,
+                                   FunctionArgList &Args) {
+  EmittedKernels.push_back(CGF.CurFn);
+  EmitDeviceStubBody(CGF, Args);
+}
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -858,7 +858,7 @@
   else if (getLangOpts().CUDA &&
            !getLangOpts().CUDAIsDevice &&
            FD->hasAttr<CUDAGlobalAttr>())
-    CGM.getCUDARuntime().EmitDeviceStubBody(*this, Args);
+    CGM.getCUDARuntime().EmitDeviceStub(*this, Args);
   else if (isa<CXXMethodDecl>(FD) &&
            cast<CXXMethodDecl>(FD)->isLambdaToBlockPointerConversion()) {
     // The lambda conversion to block pointer is special; the semantics can't be
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -350,6 +350,13 @@
   if (ObjCRuntime)
     if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
       AddGlobalCtor(ObjCInitFunction);
+  if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
+      CUDARuntime) {
+    if (llvm::Function *CudaCtorFunction = CUDARuntime->ModuleCtorFunction())
+      AddGlobalCtor(CudaCtorFunction);
+    if (llvm::Function *CudaDtorFunction = CUDARuntime->ModuleDtorFunction())
+      AddGlobalDtor(CudaDtorFunction);
+  }
   if (PGOReader && PGOStats.hasDiagnostics())
     PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
   EmitCtorList(GlobalCtors, "llvm.global_ctors");
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -24,6 +24,8 @@
   switch (AC) {
   case InputClass: return "input";
   case BindArchClass: return "bind-arch";
+  case CudaDeviceClass: return "cuda-device";
+  case CudaHostClass: return "cuda-host";
   case PreprocessJobClass: return "preprocessor";
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
@@ -53,6 +55,25 @@
                                const char *_ArchName)
     : Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {}
 
+void CudaDeviceAction::anchor() {}
+
+CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
+                                   const char *ArchName, bool AtTopLevel)
+    : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName),
+      AtTopLevel(AtTopLevel) {}
+
+void CudaHostAction::anchor() {}
+
+CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
+                               const ActionList &_DeviceActions)
+    : Action(CudaHostClass, std::move(Input)), DeviceActions(_DeviceActions) {}
+
+CudaHostAction::~CudaHostAction() {
+  for (iterator it = DeviceActions.begin(), ie = DeviceActions.end(); it != ie;
+       ++it)
+    delete *it;
+}
+
 void JobAction::anchor() {}
 
 JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input,
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -180,10 +180,11 @@
   } else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) {
     FinalPhase = phases::Backend;
 
-    // -c only runs up to the assembler.
-  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
+    // -c and partial CUDA compilations only run up to the assembler.
+  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_fcuda_device_only)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_fcuda_host_only))) {
     FinalPhase = phases::Assemble;
-
     // Otherwise do everything.
   } else
     FinalPhase = phases::Link;
@@ -819,7 +820,25 @@
 }
 
 static unsigned PrintActions1(const Compilation &C, Action *A,
-                              std::map<Action *, unsigned> &Ids) {
+                              std::map<Action *, unsigned> &Ids);
+
+static std::string PrintActionList(const Compilation &C, ActionList &AL,
+                                   std::map<Action *, unsigned> &Ids) {
+  std::string str;
+  llvm::raw_string_ostream os(str);
+  os << "{";
+  for (Action::iterator it = AL.begin(), ie = AL.end(); it != ie;) {
+    os << PrintActions1(C, *it, Ids);
+    ++it;
+    if (it != ie)
+      os << ", ";
+  }
+  os << "}";
+  return str;
+}
+
+static unsigned PrintActions1(const Compilation &C, Action *A,
+                              std::map<Action *, unsigned> &Ids) {
   if (Ids.count(A))
     return Ids[A];
@@ -832,15 +851,14 @@
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
+  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    os << '"' << CDA->getGpuArchName() << '"' << ", {"
+       << PrintActions1(C, *CDA->begin(), Ids) << "}";
+  } else if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+    os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
+       << ", gpucode " << PrintActionList(C, CHA->getDeviceActions(), Ids);
   } else {
-    os << "{";
-    for (Action::iterator it = A->begin(), ie = A->end(); it != ie;) {
-      os << PrintActions1(C, *it, Ids);
-      ++it;
-      if (it != ie)
-        os << ", ";
-    }
-    os << "}";
+    os << PrintActionList(C, A->getInputs(), Ids);
   }
 
   unsigned Id = Ids.size();
@@ -1149,6 +1167,77 @@
   }
 }
 
+// For each unique --gpu-architecture argument, this creates a TY_CUDA_DEVICE
+// input action and then wraps each one in a CudaDeviceAction paired with the
+// appropriate GPU arch name. If we're only building device-side code, each
+// action remains independent. Otherwise we pass the device-side actions as
+// inputs to a new CudaHostAction, which combines both host- and device-side
+// actions.
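+//
+// Illustration only, not part of the patch: with the PrintActions1() changes
+// above, a device-side subgraph for a hypothetical --gpu-architecture sm_35
+// compilation is printed as
+//   N: cuda-device, "sm_35", {M}, <type>
+// and the host-side action that incorporates it is printed as
+//   K: cuda-host, {J}, gpucode {N}, <type>
+// where {J} is the regular host pipeline and {N} the wrapped device action(s).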
+static std::unique_ptr<Action>
+BuildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
+                 const Arg *InputArg, const types::ID InputType,
+                 std::unique_ptr<Action> Current, ActionList &Actions) {
+
+  assert(InputType == types::TY_CUDA &&
+         "CUDA Actions only apply to CUDA inputs.");
+
+  SmallVector<const char *, 4> GpuArchList;
+  llvm::StringSet<> GpuArchNames;
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_gpu_architecture)) {
+      A->claim();
+      if (GpuArchNames.insert(A->getValue()).second)
+        GpuArchList.push_back(A->getValue());
+    }
+  }
+
+  if (GpuArchList.empty())
+    GpuArchList.push_back("sm_20");
+
+  Driver::InputList CudaDeviceInputs;
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
+
+  ActionList CudaDeviceActions;
+  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  assert(GpuArchList.size() == CudaDeviceActions.size() &&
+         "Failed to create actions for all devices");
+
+  bool PartialCompilation = false;
+  bool DeviceOnlyCompilation = Args.hasArg(options::OPT_fcuda_device_only);
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) {
+    if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) {
+      PartialCompilation = true;
+      break;
+    }
+  }
+
+  if (PartialCompilation || DeviceOnlyCompilation) {
+    // If -o was specified, we can only work if this is a device-only
+    // compilation for a single device.
+    if (Args.hasArg(options::OPT_o) &&
+        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
+      D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+      return nullptr;
+    }
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      Actions.push_back(new CudaDeviceAction(
+          std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i], true));
+    if (DeviceOnlyCompilation)
+      Current.reset(nullptr);
+    return Current;
+  } else {
+    ActionList CudaDeviceJobActions;
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      CudaDeviceJobActions.push_back(new CudaDeviceAction(
+          std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i],
+          false));
+
+    std::unique_ptr<Action> HostAction(
+        new CudaHostAction(std::move(Current), CudaDeviceJobActions));
+    return HostAction;
+  }
+}
+
 void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
                           const InputList &Inputs, ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
@@ -1251,8 +1340,26 @@
     // Build the pipeline for this file.
     std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
 
-    for (SmallVectorImpl<phases::ID>::iterator
-           i = PL.begin(), e = PL.end(); i != e; ++i) {
+    phases::ID CudaInjectionPhase;
+    if (isSaveTempsEnabled()) {
+      // All phases are done independently; inject GPU blobs during the compile
+      // phase, as that's where we generate the glue code to initialize them.
+      CudaInjectionPhase = phases::Compile;
+    } else {
+      // Assumes that clang does everything up until the linking phase, so we
+      // inject CUDA device actions at the last step before linking. Otherwise
+      // the CUDA host action forces the preprocessor into a separate
+      // invocation.
+      if (FinalPhase == phases::Link) {
+        for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
+          auto next = i + 1;
+          if (next != e && *next == phases::Link)
+            CudaInjectionPhase = *i;
+        }
+      } else
+        CudaInjectionPhase = FinalPhase;
+    }
+
+    for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
+         i != e; ++i) {
       phases::ID Phase = *i;
 
       // We are done if this step is past what the user requested.
@@ -1274,6 +1381,15 @@
       // Otherwise construct the appropriate action.
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
+
+      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
+          !Args.hasArg(options::OPT_fcuda_host_only)) {
+        Current = BuildCudaActions(*this, TC, Args, InputArg, InputType,
+                                   std::move(Current), Actions);
+        if (!Current)
+          break;
+      }
+
       if (Current->getType() == types::TY_Nothing)
         break;
     }
@@ -1403,10 +1519,14 @@
       if (A->getType() != types::TY_Nothing)
         ++NumOutputs;
 
+#if DISABLED_FOR_NOW
+    // TODO: CUDA compilation has more than one input. Need to figure out how
+    // to detect whether it's a CUDA compilation.
     if (NumOutputs > 1) {
       Diag(clang::diag::err_drv_output_argument_with_multiple_files);
       FinalOutput = nullptr;
     }
+#endif
   }
 
   // Collect the list of architectures.
@@ -1521,7 +1641,13 @@
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    JobAction *CompileJA = cast<JobAction>(*Inputs->begin());
+    JobAction *CompileJA;
+    // Extract the real host action, if it's a CudaHostAction.
+    if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
+      CompileJA = cast<JobAction>(*CudaHA->begin());
+    else
+      CompileJA = cast<JobAction>(*Inputs->begin());
+
     const Tool *Compiler = TC->SelectTool(*CompileJA);
     if (!Compiler)
       return nullptr;
@@ -1549,6 +1675,10 @@
   return ToolForJob;
 }
 
+static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
+                                        const ArgList &Args,
+                                        StringRef DarwinArchName);
+
 void Driver::BuildJobsForAction(Compilation &C,
                                 const Action *A,
                                 const ToolChain *TC,
@@ -1559,6 +1689,20 @@
                                 InputInfo &Result) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
+  InputInfoList CudaDeviceInputInfos;
+  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+    InputInfo II;
+    // Append outputs of device jobs to the input list.
+    for (const Action *DA : CHA->getDeviceActions()) {
+      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+                         /*MultipleArchs*/ false, LinkingOutput, II);
+      CudaDeviceInputInfos.push_back(II);
+    }
+    // Override the current action with the real host compile action and
+    // continue processing it.
+    A = *CHA->begin();
+  }
+
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
     // FIXME: It would be nice to not claim this here; maybe the old scheme of
     // just using Args was better?
@@ -1581,8 +1725,21 @@
     else
       TC = &C.getDefaultToolChain();
 
-    BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(),
-                       AtTopLevel, MultipleArchs, LinkingOutput, Result);
+    BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
+                       MultipleArchs, LinkingOutput, Result);
+    return;
+  }
+
+  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    const ToolChain *TC;
+    const char *ArchName = CDA->getGpuArchName();
+    llvm::Triple HostTriple =
+        computeTargetTriple(DefaultTargetTriple, C.getArgs(), "");
+    llvm::Triple TargetTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
+                                                       : "nvptx-nvidia-cuda");
+    TC = &getTargetToolChain(C.getArgs(), TargetTriple);
+    BuildJobsForAction(C, *CDA->begin(), TC, ArchName, CDA->isAtTopLevel(),
+                       /*MultipleArchs*/ true, LinkingOutput, Result);
     return;
   }
 
@@ -1617,6 +1774,10 @@
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
+  // Append outputs of CUDA device jobs to the input list.
+  if (CudaDeviceInputInfos.size())
+    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+
   // Determine the place to write output to, if any.
   if (JA->getType() == types::TY_Nothing)
     Result = InputInfo(A->getType(), BaseInput);
@@ -2022,10 +2183,8 @@
   return Target;
 }
 
-const ToolChain &Driver::getToolChain(const ArgList &Args,
-                                      StringRef DarwinArchName) const {
-  llvm::Triple Target = computeTargetTriple(DefaultTargetTriple, Args,
-                                            DarwinArchName);
+const ToolChain &Driver::getTargetToolChain(const ArgList &Args,
+                                            llvm::Triple &Target) const {
   ToolChain *&TC = ToolChains[Target.str()];
 
   if (!TC) {
@@ -2095,6 +2254,9 @@
         break;
       }
       break;
+    case llvm::Triple::CUDA:
+      TC = new toolchains::Cuda(*this, Target, Args);
+      break;
     default:
       // TCE is an OSless target
      if (Target.getArchName() == "tce") {
@@ -2125,6 +2287,13 @@
   return *TC;
 }
 
+const ToolChain &Driver::getToolChain(const ArgList &Args,
+                                      StringRef DarwinArchName) const {
+  llvm::Triple Target =
+      computeTargetTriple(DefaultTargetTriple, Args, DarwinArchName);
+  return getTargetToolChain(Args, Target);
+}
+
 bool Driver::ShouldUseClangCompiler(const JobAction &JA) const {
   // Check if user requested no clang, or clang doesn't understand this type (we
   // only handle single inputs for now).
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -151,6 +151,8 @@
 
   case Action::InputClass:
   case Action::BindArchClass:
+  case Action::CudaDeviceClass:
+  case Action::CudaHostClass:
   case Action::LipoJobClass:
   case Action::DsymutilJobClass:
   case Action::VerifyDebugInfoJobClass:
Index: lib/Driver/ToolChains.h
===================================================================
--- lib/Driver/ToolChains.h
+++ lib/Driver/ToolChains.h
@@ -690,6 +690,18 @@
   std::string computeSysRoot() const;
 };
 
+class LLVM_LIBRARY_VISIBILITY Cuda : public Linux {
+public:
+  Cuda(const Driver &D, const llvm::Triple &Triple,
+       const llvm::opt::ArgList &Args);
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                const char *BoundArch) const override;
+  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args) const override;
+};
+
 class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux {
 protected:
   GCCVersion GCCLibAndIncVersion;
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -3600,6 +3600,62 @@
   return new tools::dragonfly::Link(*this);
 }
 
+/// Stub for the CUDA toolchain. At the moment we don't have an assembler or
+/// a linker, and we need the toolchain mainly to propagate device-side
+/// options to CC1.
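+///
+/// Illustration only, not part of the patch: TranslateArgs() below filters
+/// -Xarch_<arch> arguments and appends -march=<BoundArch>, so a hypothetical
+///   clang -x cuda --gpu-architecture sm_35 -Xarch_sm_35 -O2 axpy.cu
+/// would be expected to produce a device-side cc1 invocation carrying -O2 and
+/// "-target-cpu" "sm_35" (derived from the synthesized -march=sm_35 by the
+/// nvptx handling added to Tools.cpp below), plus -fcuda-is-device from
+/// addClangTargetOptions(). -Xarch_* arguments for other architectures are
+/// dropped for this BoundArch.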
+
+Cuda::Cuda(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
+    : Linux(D, Triple, Args) {}
+
+void Cuda::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                 llvm::opt::ArgStringList &CC1Args) const {
+  Linux::addClangTargetOptions(DriverArgs, CC1Args);
+  CC1Args.push_back("-fcuda-is-device");
+}
+
+llvm::opt::DerivedArgList *
+Cuda::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                    const char *BoundArch) const {
+  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_Xarch__)) {
+      // Skip this argument unless the architecture matches BoundArch.
+      if (A->getValue(0) != StringRef(BoundArch))
+        continue;
+
+      unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
+      unsigned Prev = Index;
+      std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
+
+      // If the argument parsing failed or more than one argument was
+      // consumed, the -Xarch_ argument's parameter tried to consume
+      // extra arguments. Emit an error and ignore.
+      //
+      // We also want to disallow any options which would alter the
+      // driver behavior; that isn't going to work in our model. We
+      // use isDriverOption() as an approximation, although things
+      // like -O4 are going to slip through.
+      if (!XarchArg || Index > Prev + 1) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
+            << A->getAsString(Args);
+        continue;
+      } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
+            << A->getAsString(Args);
+        continue;
+      }
+      XarchArg->setBaseArg(A);
+      A = XarchArg.release();
+      DAL->AddSynthesizedArg(A);
+    }
+    DAL->append(A);
+  }
+
+  DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  return DAL;
+}
 
 /// XCore tool chain
 XCore::XCore(const Driver &D, const llvm::Triple &Triple,
Index: lib/Driver/Tools.h
===================================================================
--- lib/Driver/Tools.h
+++ lib/Driver/Tools.h
@@ -41,6 +41,8 @@
 public:
   static const char *getBaseInputName(const llvm::opt::ArgList &Args,
                                       const InputInfoList &Inputs);
+  static const char *getBaseInputName(const llvm::opt::ArgList &Args,
+                                      const InputInfo &Input);
   static const char *getBaseInputStem(const llvm::opt::ArgList &Args,
                                       const InputInfoList &Inputs);
   static const char *getDependencyFileName(const llvm::opt::ArgList &Args,
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -1517,6 +1517,12 @@
     return CPUName;
   }
 
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+      return A->getValue();
+    return "";
+
   case llvm::Triple::ppc:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le: {
@@ -2572,8 +2578,30 @@
   bool IsWindowsCygnus =
       getToolChain().getTriple().isWindowsCygwinEnvironment();
   bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
+  bool IsCuda = false;
 
-  assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
+  assert(Inputs.size() >= 1 && "Must have at least one input.");
+  InputInfoList BaseInputs; // Inputs[0]
+  InputInfoList CudaInputs; // Inputs[1...]
+  const InputInfo &Input = Inputs[0];
+  BaseInputs.push_back(Input);
+
+  if (Inputs.size() > 1) {
+    // CUDA compilation mode may pass more than one file. Verify that all
+    // additional files were derived from the same source.
+    IsCuda = true;
+    StringRef BaseInput(Input.getBaseInput());
+    for (const auto &it : Inputs) {
+      if (BaseInput != StringRef(it.getBaseInput())) {
+        IsCuda = false;
+        break;
+      }
+    }
+    if (IsCuda)
+      CudaInputs.append(std::next(Inputs.begin()), Inputs.end());
+  }
+
+  assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
   // Invoke ourselves in -cc1 mode.
   //
@@ -2681,7 +2709,7 @@
   // Set the main file name, so that debug info works even with
   // -save-temps.
   CmdArgs.push_back("-main-file-name");
-  CmdArgs.push_back(getBaseInputName(Args, Inputs));
+  CmdArgs.push_back(getBaseInputName(Args, Input));
 
   // Some flags which affect the language (via preprocessor
   // defines).
@@ -2709,7 +2737,7 @@
 
     CmdArgs.push_back("-analyzer-checker=deadcode");
 
-    if (types::isCXX(Inputs[0].getType()))
+    if (types::isCXX(Input.getType()))
       CmdArgs.push_back("-analyzer-checker=cplusplus");
 
     // Enable the following experimental checkers for testing.
@@ -3237,7 +3265,7 @@
 
   // Explicitly error on some things we know we don't support and can't just
   // ignore.
-  types::ID InputType = Inputs[0].getType();
+  types::ID InputType = Input.getType();
   if (!Args.hasArg(options::OPT_fallow_unsupported)) {
     Arg *Unsupported;
     if (types::isCXX(InputType) &&
@@ -4609,7 +4637,7 @@
     assert(Output.isNothing() && "Invalid output.");
   }
 
-  for (const auto &II : Inputs) {
+  for (const auto &II : BaseInputs) {
     addDashXForInput(Args, II, CmdArgs);
 
     if (II.isFilename())
@@ -4650,16 +4678,26 @@
   const char *SplitDwarfOut;
   if (SplitDwarf) {
     CmdArgs.push_back("-split-dwarf-file");
-    SplitDwarfOut = SplitDebugName(Args, Inputs);
+    SplitDwarfOut = SplitDebugName(Args, BaseInputs);
     CmdArgs.push_back(SplitDwarfOut);
   }
 
+  // Include device-side CUDA code.
+  if (IsCuda) {
+    for (InputInfoList::const_iterator it = CudaInputs.begin(),
+                                       ie = CudaInputs.end();
+         it != ie; ++it) {
+      CmdArgs.push_back("-cuda-include-gpucode");
+      CmdArgs.push_back(it->getFilename());
+    }
+  }
+
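+  // Illustration only, not part of the patch: with two --gpu-architecture
+  // values, the loop above is expected to leave the host-side cc1 invocation
+  // with one flag pair per device-side output, e.g. (hypothetical temp names):
+  //   -cuda-include-gpucode /tmp/axpy-sm_35.s -cuda-include-gpucode /tmp/axpy-sm_30.s
+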
   // Finally add the compile command to the compilation.
   if (Args.hasArg(options::OPT__SLASH_fallback) &&
       Output.getType() == types::TY_Object &&
       (InputType == types::TY_C || InputType == types::TY_CXX)) {
-    auto CLCommand =
-        getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput);
+    auto CLCommand = getCLFallback()->GetCommand(C, JA, Output, BaseInputs,
+                                                 Args, LinkingOutput);
     C.addCommand(llvm::make_unique<FallbackCommand>(JA, *this, Exec, CmdArgs,
                                                     std::move(CLCommand)));
   } else {
@@ -5717,9 +5755,13 @@
 }
 
 const char *Clang::getBaseInputName(const ArgList &Args,
+                                    const InputInfo &Input) {
+  return Args.MakeArgString(llvm::sys::path::filename(Input.getBaseInput()));
+}
+
+const char *Clang::getBaseInputName(const ArgList &Args,
                                     const InputInfoList &Inputs) {
-  return Args.MakeArgString(
-      llvm::sys::path::filename(Inputs[0].getBaseInput()));
+  return getBaseInputName(Args, Inputs[0]);
 }
 
 const char *Clang::getBaseInputStem(const ArgList &Args,
Index: lib/Driver/Types.cpp
===================================================================
--- lib/Driver/Types.cpp
+++ lib/Driver/Types.cpp
@@ -86,6 +86,7 @@
   case TY_C: case TY_PP_C:
   case TY_CL:
   case TY_CUDA: case TY_PP_CUDA:
+  case TY_CUDA_DEVICE:
   case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
   case TY_CXX: case TY_PP_CXX:
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
@@ -122,7 +123,7 @@
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
   case TY_CXXHeader: case TY_PP_CXXHeader:
   case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
-  case TY_CUDA: case TY_PP_CUDA:
+  case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
     return true;
   }
 }
@@ -206,10 +207,12 @@
         P.push_back(phases::Compile);
         P.push_back(phases::Backend);
       }
-      P.push_back(phases::Assemble);
+      if (Id != TY_CUDA_DEVICE)
+        P.push_back(phases::Assemble);
     }
   }
-  if (!onlyPrecompileType(Id)) {
+
+  if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
    P.push_back(phases::Link);
   }
   assert(0 < P.size() && "Not enough phases in list");
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -639,6 +639,8 @@
       Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
       Opts.SanitizeRecover);
 
+  Opts.CudaGpuCodeFileNames = Args.getAllArgValues(OPT_cuda_include_gpucode);
+
   return Success;
 }
Index: test/Driver/cuda-options.cu
===================================================================
--- /dev/null
+++ test/Driver/cuda-options.cu
@@ -0,0 +1,108 @@
+// Tests CUDA compilation pipeline construction in Driver.
+
+// Simple compilation case:
+// RUN: %clang -### -nocudainc -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Typical compilation + link case:
+// RUN: %clang -### -nocudainc %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Then link things.
+// RUN:   -check-prefix CUDA-L %s
+
+// Verify that -fcuda-host-only disables device-side compilation and linking.
+// RUN: %clang -### -nocudainc -fcuda-host-only %s 2>&1 \
+// Make sure we didn't run device-side compilation.
+// RUN:   | FileCheck -check-prefix CUDA-ND \
+// Then compile host side and make sure we don't attempt to incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-NI \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that -fcuda-device-only disables host-side compilation and linking.
+// RUN: %clang -### -nocudainc -fcuda-device-only %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Make sure there is no host compilation or linking.
+// RUN:   -check-prefix CUDA-NH -check-prefix CUDA-NL %s
+
+// Verify that with -S we compile host and device sides to assembly
+// and incorporate device code on the host side.
+// RUN: %clang -### -nocudainc -S -c %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that the --gpu-architecture option passes the correct GPU
+// architecture info to device compilation.
+// RUN: %clang -### -nocudainc -gpu-architecture sm_35 -c %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// Then compile host side and incorporate GPU code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that there is one device-side compilation per --gpu-architecture arg
+// and that all results are included on the host side.
+// RUN: %clang -### -nocudainc -gpu-architecture sm_35 -gpu-architecture sm_30 -c %s 2>&1 \
+// Compile both device sides to PTX assembly.
+// RUN:   | FileCheck \
+// RUN:   -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN:   -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
+// Then compile host side and incorporate both device-side outputs.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Match device-side compilation.
+// CUDA-D1: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D1-SAME: "-fcuda-is-device"
+// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
+// CUDA-D1-SAME: "-o" "[[GPUCODE1:[^"]*]]"
+// CUDA-D1-SAME: "-x" "cuda"
+
+// Match another device-side compilation.
+// CUDA-D2: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D2-SAME: "-fcuda-is-device"
+// CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
+// CUDA-D2-SAME: "-o" "[[GPUCODE2:[^"]*]]"
+// CUDA-D2-SAME: "-x" "cuda"
+
+// Match no device-side compilation.
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-SAME-NOT: "-fcuda-is-device"
+
+// Match host-side compilation.
+// CUDA-H: "-cc1" "-triple"
+// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "-fcuda-is-device"
+// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
+// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-I1-SAME: "-cuda-include-gpucode" "[[GPUCODE1]]"
+// CUDA-H-I2-SAME: "-cuda-include-gpucode" "[[GPUCODE2]]"
+
+// Match no GPU code inclusion.
+// CUDA-H-NI-NOT: "-cuda-include-gpucode"
+
+// Match no CUDA compilation.
+// CUDA-NH-NOT: "-cc1" "-triple"
+// CUDA-NH-SAME-NOT: "-x" "cuda"
+
+// Match linker.
+// CUDA-L: "{{.*}}ld{{(.exe)?}}"
+// CUDA-L-SAME: "[[HOSTOBJ]]"
+
+// Match no linker.
+// CUDA-NL-NOT: "{{.*}}ld{{(.exe)?}}"
Index: test/Index/attributes-cuda.cu
===================================================================
--- test/Index/attributes-cuda.cu
+++ test/Index/attributes-cuda.cu
@@ -1,5 +1,5 @@
-// RUN: c-index-test -test-load-source all -x cuda %s | FileCheck %s
-
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-host-only %s | FileCheck %s
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-device-only %s | FileCheck %s
 __attribute__((device)) void f_device();
 __attribute__((global)) void f_global();
 __attribute__((constant)) int* g_constant;
Index: tools/libclang/CIndex.cpp
===================================================================
--- tools/libclang/CIndex.cpp
+++ tools/libclang/CIndex.cpp
@@ -2998,6 +2998,11 @@
       /*AllowPCHWithCompilerErrors=*/true, SkipFunctionBodies,
       /*UserFilesAreVolatile=*/true, ForSerialization, &ErrUnit));
 
+  if (!Unit && !ErrUnit) {
+    PTUI->result = CXError_ASTReadError;
+    return;
+  }
+
   if (NumErrors != Diags->getClient()->getNumErrors()) {
     // Make sure to check that 'Unit' is non-NULL.
     if (CXXIdx->getDisplayDiagnostics())
Index: unittests/ASTMatchers/ASTMatchersTest.h
===================================================================
--- unittests/ASTMatchers/ASTMatchersTest.h
+++ unittests/ASTMatchers/ASTMatchersTest.h
@@ -163,6 +163,7 @@
   std::vector<std::string> Args;
   Args.push_back("-xcuda");
   Args.push_back("-fno-ms-extensions");
+  Args.push_back("-fcuda-host-only");
   Args.push_back(CompileArg);
   if (!runToolOnCodeWithArgs(Factory->create(), CudaHeader + Code, Args)) {