Diff 156626

lib/CodeGen/CGCUDANV.cpp

Show First 20 Lines • Show All 303 Lines • ▼ Show 20 Lines	for (auto &Pair : DeviceVars) {
Builder.CreateCall(RegisterVar, Args);		Builder.CreateCall(RegisterVar, Args);
}		}

Builder.CreateRetVoid();		Builder.CreateRetVoid();
return RegisterKernelsFunc;		return RegisterKernelsFunc;
}		}

/// Creates a global constructor function for the module:		/// Creates a global constructor function for the module:
		///
		/// For CUDA:
/// \code		/// \code
/// void __cuda_module_ctor(void*) {		/// void __cuda_module_ctor(void*) {
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);		/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
/// __cuda_register_globals(Handle);		/// __cuda_register_globals(Handle);
/// }		/// }
/// \endcode		/// \endcode
		///
		/// For HIP:
		/// \code
		/// void __hip_module_ctor(void*) {
		/// if (__hip_gpubin_handle == 0) {
		/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
		/// __hip_register_globals(__hip_gpubin_handle);
		/// }
		/// }
		/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {		llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
bool IsHIP = CGM.getLangOpts().HIP;		bool IsHIP = CGM.getLangOpts().HIP;
// No need to generate ctors/dtors if there is no GPU binary.		// No need to generate ctors/dtors if there is no GPU binary.
StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;		StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
if (CudaGpuBinaryFileName.empty() && !IsHIP)		if (CudaGpuBinaryFileName.empty() && !IsHIP)
return nullptr;		return nullptr;

// void __{cuda\|hip}_register_globals(void* handle);		// void __{cuda\|hip}_register_globals(void* handle);
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
Values.add(FatBinStr);		Values.add(FatBinStr);
// Unused in fatbin v1.		// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));		Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(		llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),		addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
/*constant*/ true);		/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);		FatbinWrapper->setSection(FatbinSectionName);

// Register binary with CUDA/HIP runtime. This is substantially different in		// There is only one HIP fat binary per linked module, however there are
		// multiple constructor functions. Make sure the fat binary is registered
		// only once. The constructor functions are executed by the dynamic loader
		// before the program gains control. The dynamic loader cannot execute the
		// constructor functions concurrently since doing that would not guarantee
		// thread safety of the loaded program. Therefore we can assume sequential
		// execution of constructor functions here.
		if (IsHIP) {
		llvm::BasicBlock *IfBlock =
		llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
		llvm::BasicBlock *ExitBlock =
		llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
		// The name, size, and initialization pattern of this variable is part
		// of HIP ABI.
		GpuBinaryHandle = new llvm::GlobalVariable(
		rjmccallUnsubmitted Done Reply Inline Actions Do you not need to worry about concurrency here? rjmccall: Do you not need to worry about concurrency here?
		yaxunlAuthorUnsubmitted Done Reply Inline Actions The ctor functions are executed by the dynamic loader before the program gains control. The dynamic loader cannot execute the ctor functions concurrently since doing that would not guarantee thread safety of the loaded program. Therefore we can assume sequential execution of ctor functions here. yaxunl: The ctor functions are executed by the dynamic loader before the program gains control. The…
		rjmccallUnsubmitted Done Reply Inline Actions Okay. That's worth a comment. Is the name here specified by some ABI document, or is it just a conventional name that we're picking now? rjmccall: Okay. That's worth a comment. Is the name here specified by some ABI document, or is it just…
		yaxunlAuthorUnsubmitted Done Reply Inline Actions Will add a comment for that. You mean `__hip_gpubin_handle`? It is an implementation detail. It is not defined by ABI or other documentation. Since it is only used internally by ctor functions, it is not a visible ELF symbol. Its name is by convention, since the corresponding CUDA one was named __cuda_gpubin_handle. yaxunl: Will add a comment for that. You mean `__hip_gpubin_handle`? It is an implementation detail.
		rjmccallUnsubmitted Done Reply Inline Actions Well, it is ABI, right? It's necessary for all translation units to agree to use the same symbol here or else the registration will happen multiple times. rjmccall: Well, it is ABI, right? It's necessary for all translation units to agree to use the same…
		yaxunlAuthorUnsubmitted Done Reply Inline Actions Right. I created a pull request for HIP to document this https://github.com/ROCm-Developer-Tools/HIP/pull/580/files yaxunl: Right. I created a pull request for HIP to document this https://github.com/ROCm-Developer…
		rjmccallUnsubmitted Done Reply Inline Actions Okay. Please leave a comment here explaining that this variable's name, size, and initialization pattern are part of the HIP ABI, then. rjmccall: Okay. Please leave a comment here explaining that this variable's name, size, and…
		yaxunlAuthorUnsubmitted Not Done Reply Inline Actions Will do yaxunl: Will do
		TheModule, VoidPtrPtrTy, /*isConstant=*/false,
		llvm::GlobalValue::LinkOnceAnyLinkage,
		/*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
		"__hip_gpubin_handle");
		traUnsubmitted Done Reply Inline Actions Given that it's HIP-only code, there will be no `cuda`. tra: Given that it's HIP-only code, there will be no `cuda`.
		rjmccallUnsubmitted Done Reply Inline Actions Should you just make `GpuBinaryHandle` an `Address` so that you don't have to repeat the alignment assumption over and over? Also, you should set an alignment on the variable itself. rjmccall: Should you just make `GpuBinaryHandle` an `Address` so that you don't have to repeat the…
		yaxunlAuthorUnsubmitted Not Done Reply Inline Actions will do yaxunl: will do
		GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
		Address GpuBinaryAddr(
		GpuBinaryHandle,
		CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
		rjmccallUnsubmitted Done Reply Inline Actions When I'm generating control flow like this, I find it helpful to at least use vertical spacing to separate the blocks, and sometimes I even put all the code within a block in a brace-statement (`{ ... }`) to more clearly scope the block-local values within the block. rjmccall: When I'm generating control flow like this, I find it helpful to at least use vertical spacing…
		yaxunlAuthorUnsubmitted Not Done Reply Inline Actions will do yaxunl: will do
		{
		auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
		llvm::Constant *Zero =
		llvm::Constant::getNullValue(HandleValue->getType());
		llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
		CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
		}
		{
		CtorBuilder.SetInsertPoint(IfBlock);
		// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
		llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
		RegisterFatbinFunc,
		CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
		CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
		rjmccallUnsubmitted Not Done Reply Inline Actions I meant more putting all the code for IfBlock in a brace-statement, that kind of thing. It's not as important for the earlier stuff because that actually dominates the rest of your code here. rjmccall: I meant more putting all the code for IfBlock in a brace-statement, that kind of thing. It's…
		CtorBuilder.CreateBr(ExitBlock);
		}
		{
		CtorBuilder.SetInsertPoint(ExitBlock);
		// Call __hip_register_globals(GpuBinaryHandle);
		if (RegisterGlobalsFunc) {
		auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
		CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
		}
		}
		} else if (!RelocatableDeviceCode) {
		// Register binary with CUDA runtime. This is substantially different in
// default mode vs. separate compilation!		// default mode vs. separate compilation!
if (!RelocatableDeviceCode) {		// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
// GpuBinaryHandle = __{cuda\|hip}RegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(		llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,		RegisterFatbinFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));		CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
GpuBinaryHandle = new llvm::GlobalVariable(		GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,		TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
llvm::ConstantPointerNull::get(VoidPtrPtrTy),		llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
addUnderscoredPrefixToName("_gpubin_handle"));		GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());

CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,		CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());		CGM.getPointerAlign());

// Call __{cuda\|hip}_register_globals(GpuBinaryHandle);		// Call __cuda_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)		if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);		CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
} else {		} else {
// Generate a unique module ID.		// Generate a unique module ID.
SmallString<64> ModuleID;		SmallString<64> ModuleID;
llvm::raw_svector_ostream OS(ModuleID);		llvm::raw_svector_ostream OS(ModuleID);
OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());		OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
llvm::Constant *ModuleIDConstant =		llvm::Constant *ModuleIDConstant =
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);		makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);

// Create an alias for the FatbinWrapper that nvcc or hip backend will		// Create an alias for the FatbinWrapper that nvcc will look for.
// look for.
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,		llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);		Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

// void __{cuda\|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,		// void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
// void *, void (*)(void **))		// void *, void (*)(void **))
SmallString<128> RegisterLinkedBinaryName(		SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
addUnderscoredPrefixToName("RegisterLinkedBinary"));
RegisterLinkedBinaryName += ModuleID;		RegisterLinkedBinaryName += ModuleID;
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(		llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);		getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

assert(RegisterGlobalsFunc && "Expecting at least dummy function!");		assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
llvm::Value *Args[] = {RegisterGlobalsFunc,		llvm::Value *Args[] = {RegisterGlobalsFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),		CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
ModuleIDConstant,		ModuleIDConstant,
Show All 15 Lines	llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
}		}

CtorBuilder.CreateRetVoid();		CtorBuilder.CreateRetVoid();
return ModuleCtorFunc;		return ModuleCtorFunc;
}		}

/// Creates a global destructor function that unregisters the GPU code blob		/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.		/// registered by constructor.
		///
		/// For CUDA:
/// \code		/// \code
/// void __cuda_module_dtor(void*) {		/// void __cuda_module_dtor(void*) {
/// __cudaUnregisterFatBinary(Handle);		/// __cudaUnregisterFatBinary(Handle);
/// }		/// }
/// \endcode		/// \endcode
		///
		/// For HIP:
		/// \code
		/// void __hip_module_dtor(void*) {
		/// if (__hip_gpubin_handle) {
		/// __hipUnregisterFatBinary(__hip_gpubin_handle);
		/// __hip_gpubin_handle = 0;
		/// }
		/// }
		/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {		llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// No need for destructor if we don't have a handle to unregister.		// No need for destructor if we don't have a handle to unregister.
if (!GpuBinaryHandle)		if (!GpuBinaryHandle)
return nullptr;		return nullptr;

// void __cudaUnregisterFatBinary(void ** handle);		// void __cudaUnregisterFatBinary(void ** handle);
llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(		llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),		llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
addUnderscoredPrefixToName("UnregisterFatBinary"));		addUnderscoredPrefixToName("UnregisterFatBinary"));

llvm::Function *ModuleDtorFunc = llvm::Function::Create(		llvm::Function *ModuleDtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),		llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage,		llvm::GlobalValue::InternalLinkage,
addUnderscoredPrefixToName("_module_dtor"), &TheModule);		addUnderscoredPrefixToName("_module_dtor"), &TheModule);

llvm::BasicBlock *DtorEntryBB =		llvm::BasicBlock *DtorEntryBB =
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);		llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
CGBuilderTy DtorBuilder(CGM, Context);		CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);		DtorBuilder.SetInsertPoint(DtorEntryBB);

auto HandleValue =		Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());		GpuBinaryHandle->getAlignment()));
		auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
		// There is only one HIP fat binary per linked module, however there are
		// multiple destructor functions. Make sure the fat binary is unregistered
		// only once.
		if (CGM.getLangOpts().HIP) {
		llvm::BasicBlock *IfBlock =
		llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
		llvm::BasicBlock *ExitBlock =
		llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
		llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
		llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
		DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

		DtorBuilder.SetInsertPoint(IfBlock);
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);		DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
		DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
		DtorBuilder.CreateBr(ExitBlock);

		DtorBuilder.SetInsertPoint(ExitBlock);
		} else {
		DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
		}
DtorBuilder.CreateRetVoid();		DtorBuilder.CreateRetVoid();
return ModuleDtorFunc;		return ModuleDtorFunc;
}		}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {		CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
return new CGNVCUDARuntime(CGM);		return new CGNVCUDARuntime(CGM);
}		}
		rjmccallUnsubmitted Done Reply Inline Actions Don't just indent stuff if you're not putting it in a brace-statement. By "vertical space" I meant putting newlines between the emission of the different blocks. rjmccall: Don't just indent stuff if you're not putting it in a brace-statement. By "vertical space" I…
		yaxunlAuthorUnsubmitted Not Done Reply Inline Actions sorry I misunderstood. will fix. yaxunl: sorry I misunderstood. will fix.

test/CodeGenCUDA/device-stub.cu

	Show All 13 Lines
	// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \			// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
	// RUN: -fcuda-include-gpubinary %t -o - -x hip\			// RUN: -fcuda-include-gpubinary %t -o - -x hip\
	// RUN: \| FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP			// RUN: \| FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
	// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \			// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
	// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \			// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \
	// RUN: \| FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS			// RUN: \| FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
	// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \			// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
	// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \			// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
	// RUN: \| FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,HIP,HIPRDC			// RUN: \| FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
	// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\			// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
	// RUN: \| FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN			// RUN: \| FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN

	#include "Inputs/cuda.h"			#include "Inputs/cuda.h"

	#ifndef NOGLOBALS			#ifndef NOGLOBALS
	// ALL-DAG: @device_var = internal global i32			// ALL-DAG: @device_var = internal global i32
	__device__ int device_var;			__device__ int device_var;
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	// CUDA-SAME: { i32 1180844977, i32 1,			// CUDA-SAME: { i32 1180844977, i32 1,
	// HIP-SAME: { i32 1212764230, i32 1,			// HIP-SAME: { i32 1212764230, i32 1,
	// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),			// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
	// HIP-SAME: i8* @[[FATBIN]],			// HIP-SAME: i8* @[[FATBIN]],
	// ALL-SAME: i8* null }			// ALL-SAME: i8* null }
	// CUDA-SAME: section ".nvFatBinSegment"			// CUDA-SAME: section ".nvFatBinSegment"
	// HIP-SAME: section ".hipFatBinSegment"			// HIP-SAME: section ".hipFatBinSegment"
	// * variable to save GPU binary handle after initialization			// * variable to save GPU binary handle after initialization
	// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null			// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
				// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null
	// * constant unnamed string with NVModuleID			// * constant unnamed string with NVModuleID
	// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant			// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
	// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32			// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
	// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
	// * Make sure our constructor was added to global ctor list.			// * Make sure our constructor was added to global ctor list.
	// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor			// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
	// * Alias to global symbol containing the NVModuleID.			// * Alias to global symbol containing the NVModuleID.
	// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8, i8 }			// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8, i8 }
	// RDC-SAME: { i32, i32, i8, i8 }* @__[[PREFIX]]_fatbin_wrapper			// RDC-SAME: { i32, i32, i8, i8 }* @__[[PREFIX]]_fatbin_wrapper

	// Test that we build the correct number of calls to cudaSetupArgument followed			// Test that we build the correct number of calls to cudaSetupArgument followed
	// by a call to cudaLaunch.			// by a call to cudaLaunch.
	Show All 20 Lines
	// ALL-DAG: call{{.}}[[PREFIX]]RegisterVar(i8* %0, {{.}}ext_device_var{{.}}i32 1, i32 4, i32 0, i32 0			// ALL-DAG: call{{.}}[[PREFIX]]RegisterVar(i8* %0, {{.}}ext_device_var{{.}}i32 1, i32 4, i32 0, i32 0
	// ALL-DAG: call{{.}}[[PREFIX]]RegisterVar(i8* %0, {{.}}ext_constant_var{{.}}i32 1, i32 4, i32 1, i32 0			// ALL-DAG: call{{.}}[[PREFIX]]RegisterVar(i8* %0, {{.}}ext_constant_var{{.}}i32 1, i32 4, i32 1, i32 0
	// ALL: ret void			// ALL: ret void

	// Test that we've built a constructor.			// Test that we've built a constructor.
	// ALL: define internal void @__[[PREFIX]]_module_ctor			// ALL: define internal void @__[[PREFIX]]_module_ctor

	// In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)			// In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
				// HIP only registers the fat binary once.
				// HIP: load i8, i8* @__hip_gpubin_handle
				// HIP-NEXT: icmp eq i8** {{.*}}, null
				// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
				// HIP: if:
	// NORDC: call{{.}}[[PREFIX]]RegisterFatBinary{{.}}__[[PREFIX]]_fatbin_wrapper			// NORDC: call{{.}}[[PREFIX]]RegisterFatBinary{{.}}__[[PREFIX]]_fatbin_wrapper
	// .. stores return value in __[[PREFIX]]_gpubin_handle			// .. stores return value in __[[PREFIX]]_gpubin_handle
	// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle			// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
	// .. and then calls __[[PREFIX]]_register_globals			// .. and then calls __[[PREFIX]]_register_globals
				// HIP-NEXT: br label %exit
				// HIP: exit:
				// HIP-NEXT: load i8, i8* @__hip_gpubin_handle
	// NORDC-NEXT: call void @__[[PREFIX]]_register_globals			// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
	// * In separate mode we also register a destructor.			// * In separate mode we also register a destructor.
	// NORDC-NEXT: call i32 @atexit(void (i8) @__[[PREFIX]]_module_dtor)			// NORDC-NEXT: call i32 @atexit(void (i8) @__[[PREFIX]]_module_dtor)

	// With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%			// With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
	// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](			// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
	// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper			// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
	// RDC-SAME: [[MODULE_ID_GLOBAL]]			// RDC-SAME: [[MODULE_ID_GLOBAL]]

	// Test that we've created destructor.			// Test that we've created destructor.
	// NORDC: define internal void @__[[PREFIX]]_module_dtor			// NORDC: define internal void @__[[PREFIX]]_module_dtor
	// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle			// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
	// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary			// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
				// HIP-NEXT: icmp ne i8** {{.*}}, null
				// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
				// HIP: if:
				// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
				// HIP-NEXT: store i8 null, i8* @__hip_gpubin_handle
				// HIP-NEXT: br label %exit
				// HIP: exit:

	// There should be no __[[PREFIX]]_register_globals if we have no			// There should be no __[[PREFIX]]_register_globals if we have no
	// device-side globals, but we still need to register GPU binary.			// device-side globals, but we still need to register GPU binary.
	// Skip GPU binary string first.			// Skip GPU binary string first.
	// CUDANOGLOBALS: @{{.}} = private constant{{.}}			// CUDANOGLOBALS: @{{.}} = private constant{{.}}
	// HIPNOGLOBALS: @{{.}} = external constant{{.}}			// HIPNOGLOBALS: @{{.}} = external constant{{.}}
	// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals			// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
	// NOGLOBALS: define internal void @__[[PREFIX:cuda\|hip]]_module_ctor			// NOGLOBALS: define internal void @__[[PREFIX:cuda\|hip]]_module_ctor
	Show All 9 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[HIP] Register/unregister device fat binary only once
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 156626

lib/CodeGen/CGCUDANV.cpp

test/CodeGenCUDA/device-stub.cu

This is an archive of the discontinued LLVM Phabricator instance.

[HIP] Register/unregister device fat binary only once
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 156626

lib/CodeGen/CGCUDANV.cpp

test/CodeGenCUDA/device-stub.cu

[HIP] Register/unregister device fat binary only once
ClosedPublic