nvcc generates a unique registration function for each object file
that contains relocatable device code. Unique names are achieved
with a module id that is also reflected in the function's name.
Diff Detail
Repository: rC Clang
Event Timeline
I didn't write tests for this yet, but I wanted to get some early feedback on this and show what I have in mind.
lib/CodeGen/CGCUDANV.cpp
333–334: Can we actually have multiple GPU binaries here? If yes, how do I get there?
334–335: @jlebar The same here, probably __NV_CUDA,__nv_module_id?
335–336: @jlebar Could you help me here as I don't have a Mac? I'd guess it's __NV_CUDA,__nv_relfatbin, but I'd feel better if I can get a confirmation...
Still no regression tests.
I did some functional tests though (https://reviews.llvm.org/F5822023): with this patch, Clang can generate valid object files with relocatable device code. For linking I still defer to nvcc, and I'm not sure I'm interested in reverse-engineering the needed tools to make this fully work with Clang's driver: I think the biggest advantage of CUDA in Clang is using LLVM's CodeGen. Note that (in my simple tests) Clang's object files were compatible enough to mix them with other objects generated by nvcc (see Makefile.mixed)!
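The build flow exercised by those functional tests can be sketched roughly as follows. This is an illustrative transcript, not taken from the patch: file names are hypothetical, a CUDA toolkit is assumed to be installed, and the exact flags may differ between toolchain versions.

```shell
# Compile one TU with Clang and one with nvcc, both with relocatable
# device code (flags illustrative):
clang++ --cuda-gpu-arch=sm_35 -fcuda-rdc -c a.cu -o a.o
nvcc -arch=sm_35 -rdc=true -c b.cu -o b.o

# Device-link with nvcc (this patch does not teach Clang's driver to do
# this step), then link the host objects as usual:
nvcc -arch=sm_35 -dlink a.o b.o -o dlink.o
g++ a.o b.o dlink.o -lcudart -o prog
```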
lib/CodeGen/CGCUDANV.cpp
333–334: Yes. clang --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_50... will compile for sm_35 and sm_50 and then pass the names of the GPU-side objects to the host compilation via -fcuda-include-gpubinary.
lib/CodeGen/CGCUDANV.cpp
333–334: I'm not sure that's true anymore: I think they are now combined by fatbinary. This seems to be confirmed by test/Driver/cuda-options.cu. If that was the only use case, we may try to get rid of this possibility; let me check.
lib/CodeGen/CGCUDANV.cpp
333–334: You are correct. All GPU binaries are in a single fatbin now. We should remove this loop and make CGM.getCodeGenOpts().CudaGpuBinaryFileNames a scalar.
lib/CodeGen/CGCUDANV.cpp
333–334: OK, I'll work on this as a preparation patch and rebase this on top. That actually explains why my changes have always worked even though the loop wasn't handled correctly :-)
Sorry for the long delay. This update rebases the patch against current trunk and adapts the regression test.
lib/CodeGen/CGCUDANV.cpp
298–300: Instead of tracking these through the conditionals of a pretty long function, could we make these pointers class fields, init them in the constructor, have accessors return them, and possibly assert that they are used only if RDC is enabled?

test/CodeGenCUDA/device-stub.cu
2–6: Labels could be a bit more descriptive. Long RUN lines could use some re-wrapping.
Move FunctionTypes to methods and change test prefixes.
lib/CodeGen/CGCUDANV.cpp
298–300: I've removed the caching entirely because that's already done by llvm::FunctionType::get(). These are now called in new methods to avoid duplication.
lib/CodeGen/CGCUDANV.cpp
364–407: This can all be folded into the 'else' branch of the 'if' below.