This is an archive of the discontinued LLVM Phabricator instance.

lib/Driver/ToolChains/Clang.cpp
4664–4665	Nit: Passing multiple things as a single input may need some more details. E.g. `...receives all device-side outputs in a single fatbin as Inputs[1]`
lib/Frontend/CompilerInvocation.cpp
1048–1050	If more than one gpu binary is passed, all but last will be ignored. IMO in this case we would want to either warn that some inputs were ignored or report an error that there is more than one GPU binary.

Hahnfeld added inline comments. · Feb 20 2018, 10:24 AM

lib/Frontend/CompilerInvocation.cpp
1048–1050	Well, `-fcuda-include-gpubinary` is only recognized on `cc1`. I think we can assume that we are correctly assembling our command line, can't we? (Nobody else checks the options here...)

tra accepted this revision. · Feb 20 2018, 10:43 AM

tra added inline comments.

lib/Frontend/CompilerInvocation.cpp
1048–1050	Fair enough. Assert, then?

This revision is now accepted and ready to land. · Feb 20 2018, 10:43 AM

Hahnfeld added inline comments. · Feb 20 2018, 10:51 AM

lib/Frontend/CompilerInvocation.cpp
1048–1050	I added an `assert` in `lib/Driver/ToolChains/Clang.cpp` where we are constructing the command line. I think that guarantees that we are getting only a single argument.

Update comment.

Closed by commit rL326342: [CUDA] Include single GPU binary, NFCI. (authored by Hahnfeld). · Explain Why · Feb 28 2018, 9:56 AM

This revision was automatically updated to reflect the committed changes.

Herald added a subscriber: llvm-commits. · View Herald Transcript · Feb 28 2018, 9:56 AM

Revision Contents

Path

Size

include/

clang/

Frontend/

CodeGenOptions.h

7 lines

lib/

CodeGen/

CGCUDANV.cpp

135 lines

Driver/

ToolChains/

Clang.cpp

11 lines

Frontend/

CompilerInvocation.cpp

4 lines

test/

Driver/

cuda-options.cu

23 lines

Diff 135649

include/clang/Frontend/CodeGenOptions.h

Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	public:
/// importing.		/// importing.
std::string ThinLTOIndexFile;		std::string ThinLTOIndexFile;

/// Name of a file that can optionally be written with minimized bitcode		/// Name of a file that can optionally be written with minimized bitcode
/// to be used as input for the ThinLTO thin link step, which only needs		/// to be used as input for the ThinLTO thin link step, which only needs
/// the summary and module symbol table (and not, e.g. any debug metadata).		/// the summary and module symbol table (and not, e.g. any debug metadata).
std::string ThinLinkBitcodeFile;		std::string ThinLinkBitcodeFile;

/// A list of file names passed with -fcuda-include-gpubinary options to		/// Name of file passed with -fcuda-include-gpubinary option to forward to
/// forward to CUDA runtime back-end for incorporating them into host-side		/// CUDA runtime back-end for incorporating them into host-side object file.
/// object file.		std::string CudaGpuBinaryFileName;
std::vector<std::string> CudaGpuBinaryFileNames;

/// The name of the file to which the backend should save YAML optimization		/// The name of the file to which the backend should save YAML optimization
/// records.		/// records.
std::string OptRecordFile;		std::string OptRecordFile;

/// Regular expression to select optimizations for which we should enable		/// Regular expression to select optimizations for which we should enable
/// optimization remarks. Transformation passes whose name matches this		/// optimization remarks. Transformation passes whose name matches this
/// expression (and support this feature), will emit a diagnostic		/// expression (and support this feature), will emit a diagnostic
▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

lib/CodeGen/CGCUDANV.cpp

Show All 35 Lines	private:

/// Convenience reference to LLVM Context		/// Convenience reference to LLVM Context
llvm::LLVMContext &Context;		llvm::LLVMContext &Context;
/// Convenience reference to the current module		/// Convenience reference to the current module
llvm::Module &TheModule;		llvm::Module &TheModule;
/// Keeps track of kernel launch stubs emitted in this module		/// Keeps track of kernel launch stubs emitted in this module
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;		llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;		llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
/// Keeps track of variables containing handles of GPU binaries. Populated by		/// Keeps track of variable containing handle of GPU binary. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in		/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()		/// ModuleDtorFunction()
llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;		llvm::GlobalVariable *GpuBinaryHandle = nullptr;

llvm::Constant *getSetupArgumentFn() const;		llvm::Constant *getSetupArgumentFn() const;
llvm::Constant *getLaunchFn() const;		llvm::Constant *getLaunchFn() const;

/// Creates a function to register all kernel stubs generated in this module.		/// Creates a function to register all kernel stubs generated in this module.
llvm::Function *makeRegisterGlobalsFn();		llvm::Function *makeRegisterGlobalsFn();

/// Helper function that generates a constant string and returns a pointer to		/// Helper function that generates a constant string and returns a pointer to
▲ Show 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {

Builder.CreateRetVoid();		Builder.CreateRetVoid();
return RegisterKernelsFunc;		return RegisterKernelsFunc;
}		}

/// Creates a global constructor function for the module:		/// Creates a global constructor function for the module:
/// \code		/// \code
/// void __cuda_module_ctor(void*) {		/// void __cuda_module_ctor(void*) {
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);		/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
/// __cuda_register_globals(Handle0);		/// __cuda_register_globals(Handle);
/// ...
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
/// __cuda_register_globals(HandleN);
/// }		/// }
/// \endcode		/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {		llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// No need to generate ctors/dtors if there are no GPU binaries.		// No need to generate ctors/dtors if there is no GPU binary.
if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())		std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
		if (GpuBinaryFileName.empty())
return nullptr;		return nullptr;

// void __cuda_register_globals(void* handle);		// void __cuda_register_globals(void* handle);
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();		llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// void ** __cudaRegisterFatBinary(void *);		// void ** __cudaRegisterFatBinary(void *);
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(		llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),		llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
"__cudaRegisterFatBinary");		"__cudaRegisterFatBinary");
// struct { int magic, int version, void * gpu_binary, void * dont_care };		// struct { int magic, int version, void * gpu_binary, void * dont_care };
llvm::StructType *FatbinWrapperTy =		llvm::StructType *FatbinWrapperTy =
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);		llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

		// Register GPU binary with the CUDA runtime, store returned handle in a
		// global variable and save a reference in GpuBinaryHandle to be cleaned up
		// in destructor on exit. Then associate all known kernels with the GPU binary
		// handle so CUDA runtime can figure out what to call on the GPU side.
		llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
		llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
		if (std::error_code EC = GpuBinaryOrErr.getError()) {
		CGM.getDiags().Report(diag::err_cannot_open_file)
		<< GpuBinaryFileName << EC.message();
		return nullptr;
		}

llvm::Function *ModuleCtorFunc = llvm::Function::Create(		llvm::Function *ModuleCtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),		llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);		llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
llvm::BasicBlock *CtorEntryBB =		llvm::BasicBlock *CtorEntryBB =
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);		llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
CGBuilderTy CtorBuilder(CGM, Context);		CGBuilderTy CtorBuilder(CGM, Context);

CtorBuilder.SetInsertPoint(CtorEntryBB);		CtorBuilder.SetInsertPoint(CtorEntryBB);

// For each GPU binary, register it with the CUDA runtime and store returned
// handle in a global variable and save the handle in GpuBinaryHandles vector
// to be cleaned up in destructor on exit. Then associate all known kernels
// with the GPU binary handle so CUDA runtime can figure out what to call on
// the GPU side.
for (const std::string &GpuBinaryFileName :
CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
if (std::error_code EC = GpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
<< EC.message();
continue;
}

const char *FatbinConstantName =		const char *FatbinConstantName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";		CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
// NVIDIA's cuobjdump looks for fatbins in this section.		// NVIDIA's cuobjdump looks for fatbins in this section.
const char *FatbinSectionName =		const char *FatbinSectionName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";		CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

// Create initialized wrapper structure that points to the loaded GPU binary		// Create initialized wrapper structure that points to the loaded GPU binary
ConstantInitBuilder Builder(CGM);		ConstantInitBuilder Builder(CGM);
auto Values = Builder.beginStruct(FatbinWrapperTy);		auto Values = Builder.beginStruct(FatbinWrapperTy);
// Fatbin wrapper magic.		// Fatbin wrapper magic.
Values.addInt(IntTy, 0x466243b1);		Values.addInt(IntTy, 0x466243b1);
// Fatbin version.		// Fatbin version.
Values.addInt(IntTy, 1);		Values.addInt(IntTy, 1);
// Data.		// Data.
Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(),		Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
"", FatbinConstantName, 8));		FatbinConstantName, 8));
// Unused in fatbin v1.		// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));		Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper =		llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",		"__cuda_fatbin_wrapper", CGM.getPointerAlign(),
CGM.getPointerAlign(),
/*constant*/ true);		/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);		FatbinWrapper->setSection(FatbinSectionName);

// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);		// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(		llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,		RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));		GpuBinaryHandle = new llvm::GlobalVariable(
llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,		TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");		llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,		CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());		CGM.getPointerAlign());

// Call __cuda_register_globals(GpuBinaryHandle);		// Call __cuda_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)		if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);		CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

// Save GpuBinaryHandle so we can unregister it in destructor.
GpuBinaryHandles.push_back(GpuBinaryHandle);
}

CtorBuilder.CreateRetVoid();		CtorBuilder.CreateRetVoid();
return ModuleCtorFunc;		return ModuleCtorFunc;
}		}

/// Creates a global destructor function that unregisters all GPU code blobs		/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.		/// registered by constructor.
/// \code		/// \code
/// void __cuda_module_dtor(void*) {		/// void __cuda_module_dtor(void*) {
/// __cudaUnregisterFatBinary(Handle0);		/// __cudaUnregisterFatBinary(Handle);
/// ...
/// __cudaUnregisterFatBinary(HandleN);
/// }		/// }
/// \endcode		/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {		llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// No need for destructor if we don't have handles to unregister.		// No need for destructor if we don't have a handle to unregister.
if (GpuBinaryHandles.empty())		if (!GpuBinaryHandle)
return nullptr;		return nullptr;

// void __cudaUnregisterFatBinary(void ** handle);		// void __cudaUnregisterFatBinary(void ** handle);
llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(		llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),		llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
"__cudaUnregisterFatBinary");		"__cudaUnregisterFatBinary");

llvm::Function *ModuleDtorFunc = llvm::Function::Create(		llvm::Function *ModuleDtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),		llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);		llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
llvm::BasicBlock *DtorEntryBB =		llvm::BasicBlock *DtorEntryBB =
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);		llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
CGBuilderTy DtorBuilder(CGM, Context);		CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);		DtorBuilder.SetInsertPoint(DtorEntryBB);

for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
auto HandleValue =		auto HandleValue =
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());		DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);		DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
}

DtorBuilder.CreateRetVoid();		DtorBuilder.CreateRetVoid();
return ModuleDtorFunc;		return ModuleDtorFunc;
}		}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {		CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
return new CGNVCUDARuntime(CGM);		return new CGNVCUDARuntime(CGM);
}		}

lib/Driver/ToolChains/Clang.cpp

Show First 20 Lines • Show All 4,655 Lines • ▼ Show 20 Lines	for (const char *OriginalArg : OriginalArgs) {
Flags += " ";		Flags += " ";
Flags += EscapedArg;		Flags += EscapedArg;
}		}
CmdArgs.push_back("-dwarf-debug-flags");		CmdArgs.push_back("-dwarf-debug-flags");
CmdArgs.push_back(Args.MakeArgString(Flags));		CmdArgs.push_back(Args.MakeArgString(Flags));
}		}

if (IsCuda) {		if (IsCuda) {
// Host-side cuda compilation receives device-side outputs as Inputs[1...].		// Host-side cuda compilation receives all device-side outputs in a single
// Include them with -fcuda-include-gpubinary.		// fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
		[Inline comment — tra (unsubmitted, done)]: Nit: Passing multiple things as a single input may need some more details. E.g. `...receives all device-side outputs in a single fatbin as Inputs[1]`
if (Inputs.size() > 1) {		if (Inputs.size() > 1) {
for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {		assert(Inputs.size() == 2 && "More than one GPU binary!");
CmdArgs.push_back("-fcuda-include-gpubinary");		CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(I->getFilename());		CmdArgs.push_back(Inputs[1].getFilename());
}
}		}

if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))		if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
CmdArgs.push_back("-fcuda-rdc");		CmdArgs.push_back("-fcuda-rdc");
}		}

// OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path		// OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
// to specify the result of the compile phase on the host, so the meaningful		// to specify the result of the compile phase on the host, so the meaningful
▲ Show 20 Lines • Show All 914 Lines • Show Last 20 Lines

lib/Frontend/CompilerInvocation.cpp

Show First 20 Lines • Show All 1,039 Lines • ▼ Show 20 Lines	static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
// FIXME: Report unrecoverable sanitizers incorrectly specified here.		// FIXME: Report unrecoverable sanitizers incorrectly specified here.
parseSanitizerKinds("-fsanitize-recover=",		parseSanitizerKinds("-fsanitize-recover=",
Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,		Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
Opts.SanitizeRecover);		Opts.SanitizeRecover);
parseSanitizerKinds("-fsanitize-trap=",		parseSanitizerKinds("-fsanitize-trap=",
Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,		Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,
Opts.SanitizeTrap);		Opts.SanitizeTrap);

Opts.CudaGpuBinaryFileNames =		Opts.CudaGpuBinaryFileName =
Args.getAllArgValues(OPT_fcuda_include_gpubinary);		Args.getLastArgValue(OPT_fcuda_include_gpubinary);

		[Inline comment — tra (unsubmitted, not done)]: If more than one gpu binary is passed, all but last will be ignored. IMO in this case we would want to either warn that some inputs were ignored or report an error that there is more than one GPU binary.
		[Inline comment — Hahnfeld, author (unsubmitted, not done)]: Well, `-fcuda-include-gpubinary` is only recognized on `cc1`. I think we can assume that we are correctly assembling our command line, can't we? (Nobody else checks the options here...)
		[Inline comment — tra (unsubmitted, not done)]: Fair enough. Assert, then?
		[Inline comment — Hahnfeld, author (unsubmitted, not done)]: I added an `assert` in `lib/Driver/ToolChains/Clang.cpp` where we are constructing the command line. I think that guarantees that we are getting only a single argument.
Opts.Backchain = Args.hasArg(OPT_mbackchain);		Opts.Backchain = Args.hasArg(OPT_mbackchain);

Opts.EmitCheckPathComponentsToStrip = getLastArgIntValue(		Opts.EmitCheckPathComponentsToStrip = getLastArgIntValue(
Args, OPT_fsanitize_undefined_strip_path_components_EQ, 0, Diags);		Args, OPT_fsanitize_undefined_strip_path_components_EQ, 0, Diags);

return Success;		return Success;
}		}

▲ Show 20 Lines • Show All 2,000 Lines • Show Last 20 Lines

test/Driver/cuda-options.cu

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	// RUN: \| FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \			// RUN: \| FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
	// RUN: -check-prefix DEVICE-SM30 -check-prefix HOST \			// RUN: -check-prefix DEVICE-SM30 -check-prefix HOST \
	// RUN: -check-prefix INCLUDES-DEVICE -check-prefix NOLINK %s			// RUN: -check-prefix INCLUDES-DEVICE -check-prefix NOLINK %s

	// Verify that there is one device-side compilation per --cuda-gpu-arch args			// Verify that there is one device-side compilation per --cuda-gpu-arch args
	// and that all results are included on the host side.			// and that all results are included on the host side.
	// RUN: %clang -### -target x86_64-linux-gnu \			// RUN: %clang -### -target x86_64-linux-gnu \
	// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \			// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
	// RUN: \| FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \			// RUN: \| FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
	// RUN: -check-prefix DEVICE2 -check-prefix DEVICE-SM30 \			// RUN: -check-prefixes DEVICE-SM30,DEVICE2-SM35 \
	// RUN: -check-prefix DEVICE2-SM35 -check-prefix HOST \			// RUN: -check-prefixes INCLUDES-DEVICE,INCLUDES-DEVICE2 \
	// RUN: -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \			// RUN: -check-prefixes HOST,HOST-NOSAVE,NOLINK %s
	// RUN: -check-prefix NOLINK %s

	// Verify that device-side results are passed to the correct tool when			// Verify that device-side results are passed to the correct tool when
	// -save-temps is used.			// -save-temps is used.
	// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \			// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \
	// RUN: \| FileCheck -check-prefix DEVICE -check-prefix DEVICE-SAVE \			// RUN: \| FileCheck -check-prefix DEVICE -check-prefix DEVICE-SAVE \
	// RUN: -check-prefix HOST -check-prefix HOST-SAVE -check-prefix NOLINK %s			// RUN: -check-prefix HOST -check-prefix HOST-SAVE -check-prefix NOLINK %s

	// Verify that device-side results are passed to the correct tool when			// Verify that device-side results are passed to the correct tool when
	▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines
	// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"			// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
	// DEVICE-DAG: "[[PTXFILE]]"			// DEVICE-DAG: "[[PTXFILE]]"

	// Match another device-side compilation.			// Match another device-side compilation.
	// DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"			// DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
	// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"			// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
	// DEVICE2-SAME: "-fcuda-is-device"			// DEVICE2-SAME: "-fcuda-is-device"
	// DEVICE2-SM35-SAME: "-target-cpu" "sm_35"			// DEVICE2-SM35-SAME: "-target-cpu" "sm_35"
	// DEVICE2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"			// DEVICE2-SAME: "-o" "[[PTXFILE2:[^"]*]]"
	// DEVICE2-SAME: "-x" "cuda"			// DEVICE2-SAME: "-x" "cuda"

				// Match another call to ptxas.
				// DEVICE2: ptxas
				// DEVICE2-SM35-DAG: "--gpu-name" "sm_35"
				// DEVICE2-DAG: "--output-file" "[[CUBINFILE2:[^"]*]]"
				// DEVICE2-DAG: "[[PTXFILE2]]"

	// Match no device-side compilation.			// Match no device-side compilation.
	// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"			// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
	// NODEVICE-NOT: "-fcuda-is-device"			// NODEVICE-NOT: "-fcuda-is-device"

	// INCLUDES-DEVICE:fatbinary			// INCLUDES-DEVICE:fatbinary
	// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"			// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
	// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"			// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
	// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"			// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
				// INCLUDES-DEVICE2-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE2]]"
				// INCLUDES-DEVICE2-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE2]]"

	// Match host-side preprocessor job with -save-temps.			// Match host-side preprocessor job with -save-temps.
	// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"			// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
	// HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"			// HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
	// HOST-SAVE-NOT: "-fcuda-is-device"			// HOST-SAVE-NOT: "-fcuda-is-device"
	// HOST-SAVE-SAME: "-x" "cuda"			// HOST-SAVE-SAME: "-x" "cuda"

	// Match host-side compilation.			// Match host-side compilation.
	// HOST: "-cc1" "-triple" "x86_64--linux-gnu"			// HOST: "-cc1" "-triple" "x86_64--linux-gnu"
	// HOST-SAME: "-aux-triple" "nvptx64-nvidia-cuda"			// HOST-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
	// HOST-NOT: "-fcuda-is-device"			// HOST-NOT: "-fcuda-is-device"
	// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"			// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
	// HOST-NOSAVE-SAME: "-x" "cuda"			// HOST-NOSAVE-SAME: "-x" "cuda"
	// HOST-SAVE-SAME: "-x" "cuda-cpp-output"			// HOST-SAVE-SAME: "-x" "cuda-cpp-output"
				// There is only one GPU binary after combining it with fatbinary!
				// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
	// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"			// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
				// There is only one GPU binary after combining it with fatbinary.
				// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"

	// Match external assembler that uses compilation output.			// Match external assembler that uses compilation output.
	// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"			// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"

	// Match no GPU code inclusion.			// Match no GPU code inclusion.
	// NOINCLUDES-DEVICE-NOT: "-fcuda-include-gpubinary"			// NOINCLUDES-DEVICE-NOT: "-fcuda-include-gpubinary"

	// Match no host compilation.			// Match no host compilation.
	Show All 9 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Include single GPU binary, NFCI. · Closed · Public

Details

Diff Detail

Event Timeline