Diff 428622

clang/test/Driver/linker-wrapper-image.c

	Show All 21 Lines
	// OPENMP-NEXT: ret void			// OPENMP-NEXT: ret void
	// OPENMP-NEXT: }			// OPENMP-NEXT: }

	// OPENMP: define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" {			// OPENMP: define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" {
	// OPENMP-NEXT: entry:			// OPENMP-NEXT: entry:
	// OPENMP-NEXT: call void @__tgt_unregister_lib(%__tgt_bin_desc* @.omp_offloading.descriptor)			// OPENMP-NEXT: call void @__tgt_unregister_lib(%__tgt_bin_desc* @.omp_offloading.descriptor)
	// OPENMP-NEXT: ret void			// OPENMP-NEXT: ret void
	// OPENMP-NEXT: }			// OPENMP-NEXT: }

				// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
				// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
				yaxunlUnsubmitted Not Done Reply Inline Actions What happens if there are multiple binaries for different GPUs? Will the linker-wrapper generate one fatbinary containing both ELFs and embed the fatbinary as one image? yaxunl: What happens if there are multiple binaries for different GPUs? Will the linker-wrapper…
				jhuber6AuthorUnsubmitted Done Reply Inline Actions Yes, I'll add it to the other test. jhuber6: Yes, I'll add it to the other test.
				// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple x86_64-unknown-linux-gnu \
				// RUN: -linker-path /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=CUDA

				// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
				// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, i8* getelementptr inbounds ([0 x i8], [0 x i8]* @.fatbin_image, i32 0, i32 0), i8* null }, section ".nvFatBinSegment", align 8
				// CUDA-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
				// CUDA-NEXT: @.cuda.binary_handle = internal global i8** null
				// CUDA-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
				// CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
				// CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @.cuda.fatbin_reg, i8* null }]

				// CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
				// CUDA-NEXT: entry:
				// CUDA-NEXT: %0 = call i8** @__cudaRegisterFatBinary(i8* bitcast (%fatbin_wrapper* @.fatbin_wrapper to i8*))
				// CUDA-NEXT: store i8** %0, i8*** @.cuda.binary_handle, align 8
				// CUDA-NEXT: call void @.cuda.globals_reg(i8** %0)
				// CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(i8** %0)
				// CUDA-NEXT: %1 = call i32 @atexit(void ()* @.cuda.fatbin_unreg)
				// CUDA-NEXT: ret void
				// CUDA-NEXT: }

				// CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" {
				// CUDA-NEXT: entry:
				// CUDA-NEXT: %0 = load i8**, i8*** @.cuda.binary_handle, align 8
				// CUDA-NEXT: call void @__cudaUnregisterFatBinary(i8** %0)
				// CUDA-NEXT: ret void
				// CUDA-NEXT: }

				// CUDA: define internal void @.cuda.globals_reg(i8** %0) section ".text.startup" {
				// CUDA-NEXT: entry:
				// CUDA-NEXT: br i1 icmp ne ([0 x %__tgt_offload_entry]* @__start_cuda_offloading_entries, [0 x %__tgt_offload_entry]* @__stop_cuda_offloading_entries), label %while.entry, label %while.end

				// CUDA: while.entry:
				// CUDA-NEXT: %entry1 = phi %__tgt_offload_entry* [ getelementptr inbounds ([0 x %__tgt_offload_entry], [0 x %__tgt_offload_entry]* @__start_cuda_offloading_entries, i64 0, i64 0), %entry ], [ %7, %if.end ]
				// CUDA-NEXT: %1 = getelementptr inbounds %__tgt_offload_entry, %__tgt_offload_entry* %entry1, i64 0, i32 0
				// CUDA-NEXT: %addr = load i8*, i8** %1, align 8
				// CUDA-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, %__tgt_offload_entry* %entry1, i64 0, i32 1
				// CUDA-NEXT: %name = load i8*, i8** %2, align 8
				// CUDA-NEXT: %3 = getelementptr inbounds %__tgt_offload_entry, %__tgt_offload_entry* %entry1, i64 0, i32 2
				// CUDA-NEXT: %size = load i64, i64* %3, align 4
				// CUDA-NEXT: %4 = icmp eq i64 %size, 0
				// CUDA-NEXT: br i1 %4, label %if.then, label %if.else

				// CUDA: if.then:
				// CUDA-NEXT: %5 = call i32 @__cudaRegisterFunction(i8** %0, i8* %addr, i8* %name, i8* %name, i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
				// CUDA-NEXT: br label %if.end

				// CUDA: if.else:
				// CUDA-NEXT: %6 = call i32 @__cudaRegisterVar(i8** %0, i8* %addr, i8* %name, i8* %name, i32 0, i64 %size, i32 0, i32 0)
				// CUDA-NEXT: br label %if.end

				// CUDA: if.end:
				// CUDA-NEXT: %7 = getelementptr inbounds %__tgt_offload_entry, %__tgt_offload_entry* %entry1, i64 1
				// CUDA-NEXT: %8 = icmp eq %__tgt_offload_entry* %7, getelementptr inbounds ([0 x %__tgt_offload_entry], [0 x %__tgt_offload_entry]* @__stop_cuda_offloading_entries, i64 0, i64 0)
				// CUDA-NEXT: br i1 %8, label %while.end, label %while.entry

				// CUDA: while.end:
				// CUDA-NEXT: ret void
				// CUDA-NEXT: }

clang/test/Driver/linker-wrapper.c

	Show All 36 Lines
	// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70			// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70
	// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \			// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
	// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=LTO			// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=LTO

	// LTO: ptxas{{.}}-m64 -o {{.}}.cubin -O2 --gpu-name sm_70 {{.*}}.s			// LTO: ptxas{{.}}-m64 -o {{.}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
	// LTO-NOT: nvlink			// LTO-NOT: nvlink

	// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \			// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \			// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \
				yaxunlUnsubmitted Not Done Reply Inline Actions This option is the same as the preceding option. Is this intentional? Can we have a test that embeds multiple binaries for different GPUs? yaxunl: This option is the same as the preceding option. Is this intentional? Can we have a test that…
				jhuber6AuthorUnsubmitted Done Reply Inline Actions It's intentional to show that we can pull out two objects embedded in a single file (Like if someone did `ld -r` or something). I'll add binaries for different GPUs to show that works. jhuber6: It's intentional to show that we can pull out two objects embedded in a single file (Like if…
	// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70			// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
	// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \			// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
	// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=CUDA_OMP_LINK			// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=CUDA_OMP_LINK

	// CUDA_OMP_LINK: nvlink{{.}}-m64 -o {{.}}.out -arch sm_70 {{.}}.o {{.}}.o			// CUDA_OMP_LINK: nvlink{{.}}-m64 -o {{.}}.out -arch sm_70 {{.}}.o {{.}}.o

	// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-lib.o \			// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-lib.o \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \			// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_52			// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_52
	// RUN: llvm-ar rcs %t.a %t-lib.o			// RUN: llvm-ar rcs %t.a %t-lib.o
	// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-obj.o \			// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-obj.o \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70			// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70
	// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \			// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
	// RUN: /usr/bin/ld -- %t.a %t-obj.o -o a.out 2>&1 \| FileCheck %s --check-prefix=STATIC-LIBRARY			// RUN: /usr/bin/ld -- %t.a %t-obj.o -o a.out 2>&1 \| FileCheck %s --check-prefix=STATIC-LIBRARY

	// STATIC-LIBRARY: nvlink{{.*}} -arch sm_70			// STATIC-LIBRARY: nvlink{{.*}} -arch sm_70
	// STATIC-LIBRARY-NOT: nvlink{{.*}} -arch sm_50			// STATIC-LIBRARY-NOT: nvlink{{.*}} -arch sm_50

				// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
				// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70 \
				// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \
				// RUN: -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_52
				// RUN: clang-linker-wrapper --dry-run --host-triple x86_64-unknown-linux-gnu -linker-path \
				// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=CUDA

				// CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o
				// CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o
				// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file={{.*}}.out --image=profile=sm_52,file={{.*}}.out

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Show First 20 Lines • Show All 1,507 Lines • ▼ Show 20 Lines	int main(int argc, const char **argv) {
if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))		if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))
return reportError(std::move(Err));		return reportError(std::move(Err));

// Wrap each linked device image into a linkable host binary and add it to the		// Wrap each linked device image into a linkable host binary and add it to the
// link job's inputs.		// link job's inputs.
auto FileOrErr = wrapDeviceImages(LinkedImages);		auto FileOrErr = wrapDeviceImages(LinkedImages);
if (!FileOrErr)		if (!FileOrErr)
return reportError(FileOrErr.takeError());		return reportError(FileOrErr.takeError());
LinkerArgs.append(*FileOrErr);
		// We need to insert the new files next to the old ones to make sure they're
		// linked with the same libraries / arguments.
		if (!FileOrErr->empty()) {
		auto FirstInput = std::next(llvm::find_if(LinkerArgs, [](StringRef Str) {
		return sys::fs::exists(Str) && !sys::fs::is_directory(Str) &&
		Str != ExecutableName;
		}));
		LinkerArgs.insert(FirstInput, FileOrErr->begin(), FileOrErr->end());
		}

// Run the host linking job.		// Run the host linking job.
if (Error Err = runLinker(LinkerUserPath, LinkerArgs))		if (Error Err = runLinker(LinkerUserPath, LinkerArgs))
return reportError(std::move(Err));		return reportError(std::move(Err));

// Remove the temporary files created.		// Remove the temporary files created.
for (const auto &TempFile : TempFiles)		for (const auto &TempFile : TempFiles)
if (std::error_code EC = sys::fs::remove(TempFile))		if (std::error_code EC = sys::fs::remove(TempFile))
reportError(createFileError(TempFile, EC));		reportError(createFileError(TempFile, EC));

return EXIT_SUCCESS;		return EXIT_SUCCESS;
}		}

clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

Show All 14 Lines
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"		#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"		#include "llvm/Transforms/Utils/ModuleUtils.h"

using namespace llvm;		using namespace llvm;

namespace {		namespace {
		/// Magic number that begins the section containing the CUDA fatbinary.
		constexpr unsigned CudaFatMagic = 0x466243b1;

IntegerType *getSizeTTy(Module &M) {		IntegerType *getSizeTTy(Module &M) {
LLVMContext &C = M.getContext();		LLVMContext &C = M.getContext();
switch (M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C))) {		switch (M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C))) {
case 4u:		case 4u:
return Type::getInt32Ty(C);		return Type::getInt32Ty(C);
case 8u:		case 8u:
return Type::getInt64Ty(C);		return Type::getInt64Ty(C);
▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	void createUnregisterFunction(Module &M, GlobalVariable *BinDesc) {
Builder.CreateCall(UnRegFuncC, BinDesc);		Builder.CreateCall(UnRegFuncC, BinDesc);
Builder.CreateRetVoid();		Builder.CreateRetVoid();

// Add this function to global destructors.		// Add this function to global destructors.
// Match priority of __tgt_register_lib		// Match priority of __tgt_register_lib
appendToGlobalDtors(M, Func, /Priority/ 1);		appendToGlobalDtors(M, Func, /Priority/ 1);
}		}

		/// Return the named struct type `fatbin_wrapper`, creating it on first use.
		///
		/// The type mirrors the following C layout:
		///
		///   struct fatbin_wrapper {
		///     int32_t magic;
		///     int32_t version;
		///     void *image;
		///     void *reserved;
		///   };
		StructType *getFatbinWrapperTy(Module &M) {
		  LLVMContext &Ctx = M.getContext();

		  // Reuse the type if it has already been created in this context.
		  if (StructType *Existing = StructType::getTypeByName(Ctx, "fatbin_wrapper"))
		    return Existing;

		  IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
		  PointerType *BytePtrTy = Type::getInt8PtrTy(Ctx);
		  return StructType::create("fatbin_wrapper", Int32Ty, Int32Ty, BytePtrTy,
		                            BytePtrTy);
		}

		/// Embed the fatbinary \p Image into the module \p M so it can be found by the
		/// runtime.
		///
		/// Three globals are added to \p M, in this order:
		///  - `.fatbin_image`: the raw image bytes, placed in the fatbinary section.
		///  - `.fatbin_wrapper`: a `fatbin_wrapper` struct holding the magic number,
		///    version 1, a pointer to `.fatbin_image` and a null reserved pointer,
		///    placed in the fatbinary wrapper section with 8-byte alignment.
		///  - `__dummy.cuda_offloading.entry`: a hidden, zero-length array of offload
		///    entries placed in the `cuda_offloading_entries` section.
		///
		/// Section names are selected based on whether the module's target triple is
		/// macOS.
		///
		/// \returns the `.fatbin_wrapper` global.
		GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
		  LLVMContext &C = M.getContext();
		  IntegerType *Int32Ty = Type::getInt32Ty(C);
		  PointerType *Int8PtrTy = Type::getInt8PtrTy(C);
		  StructType *WrapperTy = getFatbinWrapperTy(M);
		  const Triple TargetTriple(M.getTargetTriple());

		  // Create the global string containing the fatbinary.
		  StringRef FatbinConstantSection =
		      TargetTriple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
		  auto *Data = ConstantDataArray::get(C, Image);
		  auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant=*/true,
		                                    GlobalVariable::InternalLinkage, Data,
		                                    ".fatbin_image");
		  Fatbin->setSection(FatbinConstantSection);

		  // Create the fatbinary wrapper: {magic, version, image, reserved}.
		  StringRef FatbinWrapperSection =
		      TargetTriple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
		  Constant *FatbinWrapper[] = {
		      ConstantInt::get(Int32Ty, CudaFatMagic),
		      ConstantInt::get(Int32Ty, 1),
		      ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
		      ConstantPointerNull::get(Int8PtrTy)};

		  Constant *FatbinInitializer = ConstantStruct::get(WrapperTy, FatbinWrapper);

		  auto *FatbinDesc =
		      new GlobalVariable(M, WrapperTy,
		                         /*isConstant=*/true, GlobalValue::InternalLinkage,
		                         FatbinInitializer, ".fatbin_wrapper");
		  FatbinDesc->setSection(FatbinWrapperSection);
		  FatbinDesc->setAlignment(Align(8));

		  // We create a dummy entry to ensure the linker will define the begin / end
		  // symbols. The CUDA runtime should ignore the null address if we attempt to
		  // register it.
		  auto *DummyInit =
		      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
		  auto *DummyEntry = new GlobalVariable(
		      M, DummyInit->getType(), /*isConstant=*/true,
		      GlobalVariable::ExternalLinkage, DummyInit,
		      "__dummy.cuda_offloading.entry");
		  DummyEntry->setSection("cuda_offloading_entries");
		  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);

		  return FatbinDesc;
		}

		/// Create the register globals function. We will iterate all of the offloading
		/// entries stored at the begin / end symbols and register them according to
		/// their type. This creates the following function in IR:
		///
		/// extern struct __tgt_offload_entry __start_cuda_offloading_entries;
		/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries;
		///
		/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int,
		/// void *, void *, void *, void *, int *);
		/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t,
		/// int64_t, int32_t, int32_t);
		///
		/// void __cudaRegisterTest(void **fatbinHandle) {
		/// for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries;
		/// entry != &__stop_cuda_offloading_entries; ++entry) {
		/// if (!entry->size)
		/// __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name,
		/// entry->name, -1, 0, 0, 0, 0, 0);
		/// else
		/// __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name,
		/// 0, entry->size, 0, 0);
		/// }
		/// }
		///
		/// TODO: This only registers functions and variables. Additional support is
		/// required for texture / surface / managed variables.
		Function *createRegisterGlobalsFunction(Module &M) {
		traUnsubmitted Not Done Reply Inline Actions Do you think generation of the CUDA registration glue could be shared with the front-end? tra: Do you think generation of the CUDA registration glue could be shared with the front-end?
		jhuber6AuthorUnsubmitted Done Reply Inline Actions I was thinking about it, but ultimately decided to keep the noise outside of the new driver to a minimum. Maybe if we move to the offloading entries being a common format we can easily share this code. Keeping it in Clang would have the advantage that it's easier to test directly and ensures we don't de-sync if anything changes. The only downside is that in the future I may want to push this functionality to a linker plugin or similar, which would require pulling it out of Clang again to prevent us needing to link in Clang to build LLVM. Also needing to do this all through the builder API isn't ideal, it would be nice if we had some kind of runtime to call to do this for us, but I didn't feel like adding yet another shared library for CUDA. I considered putting it inside the cuda header wrappers as well, but forcing every CUDA file to have some externally visible weak registration blob didn't sit well with me. jhuber6: I was thinking about it, but ultimately decided to keep the noise outside of the new driver to…
		traUnsubmitted Not Done Reply Inline Actions Perhaps front-end is not the right place for it, indeed. LLVM itself may be a better choice. We already have some things there for somewhat similar purposes (like lib/WindowsManifest) so adding a helper function to generate runtime glue for CUDA should not be unreasonable. tra: Perhaps front-end is not the right place for it, indeed. LLVM itself may be a better choice. We…
		jhuber6AuthorUnsubmitted Done Reply Inline Actions I think it's fine here for this patch, but I definitely want to move it into LLVM in the future once I start generalizing more of this stuff. jhuber6: I think it's fine here for this patch, but I definitely want to move it into LLVM in the future…
		jdoerfertUnsubmitted Not Done Reply Inline Actions I'm OK with it being here but the place to consider (IMHO) is `llvm/lib/Frontend`, maybe `/CUDA/Register.cpp`. jdoerfert: I'm OK with it being here but the place to consider (IMHO) is `llvm/lib/Frontend`, maybe…
		traUnsubmitted Not Done Reply Inline Actions OK. I'm fine keeping it all here for now. Please add a comment pointing towards the origin of this code. and, maybe, a TODO item to consolidate and move it into a better place. tra: OK. I'm fine keeping it all here for now. Please add a comment pointing towards the origin of…
		jhuber6AuthorUnsubmitted Done Reply Inline Actions Will do, thanks for the reviews. I'll land these tomorrow morning and see if anything breaks. jhuber6: Will do, thanks for the reviews. I'll land these tomorrow morning and see if anything breaks.
		LLVMContext &C = M.getContext();
		// Get the __cudaRegisterFunction function declaration.
		auto *RegFuncTy = FunctionType::get(
		Type::getInt32Ty(C),
		{Type::getInt8PtrTy(C)->getPointerTo(), Type::getInt8PtrTy(C),
		Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32Ty(C),
		Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C),
		Type::getInt8PtrTy(C), Type::getInt32PtrTy(C)},
		/isVarArg/ false);
		FunctionCallee RegFunc =
		M.getOrInsertFunction("__cudaRegisterFunction", RegFuncTy);

		// Get the __cudaRegisterVar function declaration.
		auto *RegVarTy = FunctionType::get(
		Type::getInt32Ty(C),
		{Type::getInt8PtrTy(C)->getPointerTo(), Type::getInt8PtrTy(C),
		Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32Ty(C),
		getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)},
		/isVarArg/ false);
		FunctionCallee RegVar = M.getOrInsertFunction("__cudaRegisterVar", RegVarTy);

		// Create the references to the start / stop symbols defined by the linker.
		auto *EntriesB = new GlobalVariable(
		M, ArrayType::get(getEntryTy(M), 0), /isConstant/ true,
		GlobalValue::ExternalLinkage,
		/Initializer/ nullptr, "__start_cuda_offloading_entries");
		EntriesB->setVisibility(GlobalValue::HiddenVisibility);
		auto *EntriesE = new GlobalVariable(
		M, ArrayType::get(getEntryTy(M), 0), /isConstant/ true,
		GlobalValue::ExternalLinkage,
		/Initializer/ nullptr, "__stop_cuda_offloading_entries");
		EntriesE->setVisibility(GlobalValue::HiddenVisibility);

		auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C),
		Type::getInt8PtrTy(C)->getPointerTo(),
		/isVarArg/ false);
		auto *RegGlobalsFn = Function::Create(
		RegGlobalsTy, GlobalValue::InternalLinkage, ".cuda.globals_reg", &M);
		RegGlobalsFn->setSection(".text.startup");

		// Create the loop to register all the entries.
		IRBuilder<> Builder(BasicBlock::Create(C, "entry", RegGlobalsFn));
		auto *EntryBB = BasicBlock::Create(C, "while.entry", RegGlobalsFn);
		auto *IfThenBB = BasicBlock::Create(C, "if.then", RegGlobalsFn);
		auto *IfElseBB = BasicBlock::Create(C, "if.else", RegGlobalsFn);
		auto *IfEndBB = BasicBlock::Create(C, "if.end", RegGlobalsFn);
		auto *ExitBB = BasicBlock::Create(C, "while.end", RegGlobalsFn);

		auto *EntryCmp = Builder.CreateICmpNE(EntriesB, EntriesE);
		Builder.CreateCondBr(EntryCmp, EntryBB, ExitBB);
		Builder.SetInsertPoint(EntryBB);
		auto *Entry = Builder.CreatePHI(getEntryPtrTy(M), 2, "entry");
		auto *AddrPtr =
		Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
		{ConstantInt::get(getSizeTTy(M), 0),
		ConstantInt::get(Type::getInt32Ty(C), 0)});
		auto *Addr = Builder.CreateLoad(Type::getInt8PtrTy(C), AddrPtr, "addr");
		auto *NamePtr =
		Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
		{ConstantInt::get(getSizeTTy(M), 0),
		ConstantInt::get(Type::getInt32Ty(C), 1)});
		auto *Name = Builder.CreateLoad(Type::getInt8PtrTy(C), NamePtr, "name");
		auto *SizePtr =
		Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
		{ConstantInt::get(getSizeTTy(M), 0),
		ConstantInt::get(Type::getInt32Ty(C), 2)});
		auto *Size = Builder.CreateLoad(getSizeTTy(M), SizePtr, "size");
		auto *FnCond =
		Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M)));
		Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB);
		Builder.SetInsertPoint(IfThenBB);
		Builder.CreateCall(RegFunc,
		{RegGlobalsFn->arg_begin(), Addr, Name, Name,
		ConstantInt::get(Type::getInt32Ty(C), -1),
		ConstantPointerNull::get(Type::getInt8PtrTy(C)),
		ConstantPointerNull::get(Type::getInt8PtrTy(C)),
		ConstantPointerNull::get(Type::getInt8PtrTy(C)),
		ConstantPointerNull::get(Type::getInt8PtrTy(C)),
		ConstantPointerNull::get(Type::getInt32PtrTy(C))});
		Builder.CreateBr(IfEndBB);
		Builder.SetInsertPoint(IfElseBB);
		Builder.CreateCall(RegVar, {RegGlobalsFn->arg_begin(), Addr, Name, Name,
		ConstantInt::get(Type::getInt32Ty(C), 0), Size,
		ConstantInt::get(Type::getInt32Ty(C), 0),
		ConstantInt::get(Type::getInt32Ty(C), 0)});
		Builder.CreateBr(IfEndBB);
		Builder.SetInsertPoint(IfEndBB);
		auto *NewEntry = Builder.CreateInBoundsGEP(
		getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1));
		auto *Cmp = Builder.CreateICmpEQ(
		NewEntry,
		ConstantExpr::getInBoundsGetElementPtr(
		ArrayType::get(getEntryTy(M), 0), EntriesE,
		ArrayRef<Constant *>({ConstantInt::get(getSizeTTy(M), 0),
		ConstantInt::get(getSizeTTy(M), 0)})));
		Entry->addIncoming(
		ConstantExpr::getInBoundsGetElementPtr(
		ArrayType::get(getEntryTy(M), 0), EntriesB,
		ArrayRef<Constant *>({ConstantInt::get(getSizeTTy(M), 0),
		ConstantInt::get(getSizeTTy(M), 0)})),
		&RegGlobalsFn->getEntryBlock());
		Entry->addIncoming(NewEntry, IfEndBB);
		Builder.CreateCondBr(Cmp, ExitBB, EntryBB);
		Builder.SetInsertPoint(ExitBB);
		Builder.CreateRetVoid();

		return RegGlobalsFn;
		}

		/// Create the constructor and destructor that register / unregister the
		/// fatbinary described by \p FatbinDesc with the CUDA runtime.
		///
		/// The generated IR corresponds to the following C code:
		///
		///   static void **handle;                       // .cuda.binary_handle
		///
		///   static void fatbin_unreg(void) {            // .cuda.fatbin_unreg
		///     __cudaUnregisterFatBinary(handle);
		///   }
		///
		///   static void fatbin_reg(void) {              // .cuda.fatbin_reg
		///     handle = __cudaRegisterFatBinary(&fatbin_wrapper);
		///     globals_reg(handle);                      // .cuda.globals_reg
		///     __cudaRegisterFatBinaryEnd(handle);
		///     atexit(fatbin_unreg);
		///   }
		///
		/// Only the registration function is appended to the module's global
		/// constructors (priority 1); the unregistration function is installed via
		/// `atexit()` from within the constructor.
		void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) {
		  LLVMContext &C = M.getContext();
		  Type *VoidTy = Type::getVoidTy(C);
		  PointerType *Int8PtrTy = Type::getInt8PtrTy(C);
		  PointerType *Int8PtrPtrTy = Int8PtrTy->getPointerTo();
		  // The handle is a pointer, so it is stored / loaded with pointer alignment.
		  const Align HandleAlign(M.getDataLayout().getPointerTypeSize(Int8PtrTy));

		  auto *CtorFuncTy = FunctionType::get(VoidTy, /*isVarArg=*/false);
		  auto *CtorFunc = Function::Create(CtorFuncTy, GlobalValue::InternalLinkage,
		                                    ".cuda.fatbin_reg", &M);
		  CtorFunc->setSection(".text.startup");

		  auto *DtorFuncTy = FunctionType::get(VoidTy, /*isVarArg=*/false);
		  auto *DtorFunc = Function::Create(DtorFuncTy, GlobalValue::InternalLinkage,
		                                    ".cuda.fatbin_unreg", &M);
		  DtorFunc->setSection(".text.startup");

		  // Get the __cudaRegisterFatBinary function declaration.
		  auto *RegFatTy = FunctionType::get(Int8PtrPtrTy, Int8PtrTy,
		                                     /*isVarArg=*/false);
		  FunctionCallee RegFatbin =
		      M.getOrInsertFunction("__cudaRegisterFatBinary", RegFatTy);
		  // Get the __cudaRegisterFatBinaryEnd function declaration.
		  auto *RegFatEndTy = FunctionType::get(VoidTy, Int8PtrPtrTy,
		                                        /*isVarArg=*/false);
		  FunctionCallee RegFatbinEnd =
		      M.getOrInsertFunction("__cudaRegisterFatBinaryEnd", RegFatEndTy);
		  // Get the __cudaUnregisterFatBinary function declaration.
		  auto *UnregFatTy = FunctionType::get(VoidTy, Int8PtrPtrTy,
		                                       /*isVarArg=*/false);
		  FunctionCallee UnregFatbin =
		      M.getOrInsertFunction("__cudaUnregisterFatBinary", UnregFatTy);

		  // Get the atexit function declaration.
		  auto *AtExitTy =
		      FunctionType::get(Type::getInt32Ty(C), DtorFuncTy->getPointerTo(),
		                        /*isVarArg=*/false);
		  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);

		  // Global that keeps the handle returned by __cudaRegisterFatBinary so the
		  // destructor can pass it to __cudaUnregisterFatBinary.
		  auto *BinaryHandleGlobal = new GlobalVariable(
		      M, Int8PtrPtrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
		      ConstantPointerNull::get(Int8PtrPtrTy), ".cuda.binary_handle");

		  // Create the constructor to register this image with the runtime.
		  IRBuilder<> CtorBuilder(BasicBlock::Create(C, "entry", CtorFunc));
		  CallInst *Handle = CtorBuilder.CreateCall(
		      RegFatbin,
		      ConstantExpr::getPointerBitCastOrAddrSpaceCast(FatbinDesc, Int8PtrTy));
		  CtorBuilder.CreateAlignedStore(Handle, BinaryHandleGlobal, HandleAlign);
		  CtorBuilder.CreateCall(createRegisterGlobalsFunction(M), Handle);
		  CtorBuilder.CreateCall(RegFatbinEnd, Handle);
		  CtorBuilder.CreateCall(AtExit, DtorFunc);
		  CtorBuilder.CreateRetVoid();

		  // Create the destructor to unregister the image with the runtime. We cannot
		  // use a standard global destructor after CUDA 9.2 so this must be called by
		  // `atexit()` instead.
		  IRBuilder<> DtorBuilder(BasicBlock::Create(C, "entry", DtorFunc));
		  LoadInst *BinaryHandle = DtorBuilder.CreateAlignedLoad(
		      Int8PtrPtrTy, BinaryHandleGlobal, HandleAlign);
		  DtorBuilder.CreateCall(UnregFatbin, BinaryHandle);
		  DtorBuilder.CreateRetVoid();

		  // Add this function to constructors.
		  appendToGlobalCtors(M, CtorFunc, /*Priority=*/1);
		}

} // namespace		} // namespace

Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {		Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
GlobalVariable *Desc = createBinDesc(M, Images);		GlobalVariable *Desc = createBinDesc(M, Images);
if (!Desc)		if (!Desc)
return createStringError(inconvertibleErrorCode(),		return createStringError(inconvertibleErrorCode(),
"No binary descriptors created.");		"No binary descriptors created.");
createRegisterFunction(M, Desc);		createRegisterFunction(M, Desc);
createUnregisterFunction(M, Desc);		createUnregisterFunction(M, Desc);
return Error::success();		return Error::success();
}		}

llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images) {		Error wrapCudaBinary(Module &M, ArrayRef<char> Image) {
// TODO: Implement this.		GlobalVariable *Desc = createFatbinDesc(M, Image);
		if (!Desc)
		return createStringError(inconvertibleErrorCode(),
		"No fatinbary section created.");

		createRegisterFatbinFunction(M, Desc);
return Error::success();		return Error::success();
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Add wrapper code generation for registering CUDA images
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 428622

clang/test/Driver/linker-wrapper-image.c

clang/test/Driver/linker-wrapper.c

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Add wrapper code generation for registering CUDA imagesClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 428622

clang/test/Driver/linker-wrapper-image.c

clang/test/Driver/linker-wrapper.c

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

clang/tools/clang-linker-wrapper/OffloadWrapper.cpp

[CUDA] Add wrapper code generation for registering CUDA images
ClosedPublic