This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
test/Driver/
-
Driver/
1/1
linker-wrapper.c
-
tools/clang-linker-wrapper/
-
clang-linker-wrapper/
-
ClangLinkerWrapper.cpp

Differential D124292

[OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility
ClosedPublic

Authored by jhuber6 on Apr 22 2022, 12:34 PM.

Download Raw Diff

Details

Reviewers

jdoerfert
tianshilei1992
tra
JonChesterfield

Commits

rG3530c35c6609: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility

Summary

When we do LTO we consider ourselves to have whole program visibility if
every single input file we have contains LLVM bitcode. If we have whole
program visibility then we can create a single image and utilize CUDA's
non-RDC mode by not passing -c to ptxas and ignoring the nvlink
job. This should be faster for some situations and also saves us the
time executing nvlink.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jhuber6 created this revision.Apr 22 2022, 12:34 PM

Herald added a project: Restricted Project. · View Herald TranscriptApr 22 2022, 12:34 PM

Herald added subscribers: mattd, guansong, inglorion, yaxunl. · View Herald Transcript

jhuber6 requested review of this revision.Apr 22 2022, 12:34 PM

Herald added a project: Restricted Project. · View Herald TranscriptApr 22 2022, 12:34 PM

Herald added subscribers: cfe-commits, sstefan1. · View Herald Transcript

LGTM with a minor test nit.

clang/test/Driver/linker-wrapper.c
41–42	// LTO-NOT: nvlink

This revision is now accepted and ready to land.Apr 22 2022, 12:49 PM

jhuber6 marked an inline comment as done.Apr 22 2022, 12:50 PM

Add test line

Harbormaster completed remote builds in B160937: Diff 424587.Apr 22 2022, 1:38 PM

This revision was landed with ongoing or failed builds.Apr 23 2022, 9:43 AM

Closed by commit rG3530c35c6609: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility (authored by jhuber6). · Explain Why

This revision was automatically updated to reflect the committed changes.

jhuber6 added a commit: rG3530c35c6609: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility.

Revision Contents

Path

Size

clang/

test/

Driver/

linker-wrapper.c

4 lines

tools/

clang-linker-wrapper/

ClangLinkerWrapper.cpp

25 lines

Diff 424587

clang/test/Driver/linker-wrapper.c

	Show All 32 Lines
	// HOST_LINK: ld.lld{{.}}-a -b -c {{.}}.o -o a.out			// HOST_LINK: ld.lld{{.}}-a -b -c {{.}}.o -o a.out

	// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \			// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70 \			// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70 \
	// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70			// RUN: -fembed-offload-object=%S/Inputs/dummy-bc.bc,openmp,nvptx64-nvida-cuda,sm_70
	// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \			// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
	// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=LTO			// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 \| FileCheck %s --check-prefix=LTO

	// LTO: ptxas{{.}}-m64 -o {{.}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s			// LTO: ptxas{{.}}-m64 -o {{.}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
	// LTO: nvlink{{.}}-m64 -o {{.}}.out -arch sm_70 {{.*}}.cubin			// LTO-NOT: nvlink
				traUnsubmitted Done Reply Inline Actions // LTO-NOT: nvlink tra: // LTO-NOT: nvlink

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Show First 20 Lines • Show All 589 Lines • ▼ Show 20 Lines	extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
default:		default:
return None;		return None;
}		}
}		}

// TODO: Move these to a separate file.		// TODO: Move these to a separate file.
namespace nvptx {		namespace nvptx {
Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,		Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
StringRef Arch) {		StringRef Arch, bool RDC = true) {
// NVPTX uses the ptxas binary to create device object files.		// NVPTX uses the ptxas binary to create device object files.
Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});		Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
if (!PtxasPath)		if (!PtxasPath)
return PtxasPath.takeError();		return PtxasPath.takeError();

// Create a new file to write the linked device image to.		// Create a new file to write the linked device image to.
SmallString<128> TempFile;		SmallString<128> TempFile;
if (Error Err =		if (Error Err =
Show All 14 Lines	else if (DebugInfo == FullDebugInfo && OptLevel[1] == '0')
CmdArgs.push_back("-g");		CmdArgs.push_back("-g");
for (auto &Arg : PtxasArgs)		for (auto &Arg : PtxasArgs)
CmdArgs.push_back(Arg);		CmdArgs.push_back(Arg);
CmdArgs.push_back("-o");		CmdArgs.push_back("-o");
CmdArgs.push_back(TempFile);		CmdArgs.push_back(TempFile);
CmdArgs.push_back(Opt);		CmdArgs.push_back(Opt);
CmdArgs.push_back("--gpu-name");		CmdArgs.push_back("--gpu-name");
CmdArgs.push_back(Arch);		CmdArgs.push_back(Arch);
		if (RDC)
CmdArgs.push_back("-c");		CmdArgs.push_back("-c");

CmdArgs.push_back(InputFile);		CmdArgs.push_back(InputFile);

if (Verbose)		if (Verbose)
printCommands(CmdArgs);		printCommands(CmdArgs);

if (Error Err = executeCommands(*PtxasPath, CmdArgs))		if (Error Err = executeCommands(*PtxasPath, CmdArgs))
return std::move(Err);		return std::move(Err);
▲ Show 20 Lines • Show All 290 Lines • ▼ Show 20 Lines
// `__start_` and `__stop_` symbols.		// `__start_` and `__stop_` symbols.
bool isValidCIdentifier(StringRef S) {		bool isValidCIdentifier(StringRef S) {
return !S.empty() && (isAlpha(S[0]) \|\| S[0] == '_') &&		return !S.empty() && (isAlpha(S[0]) \|\| S[0] == '_') &&
std::all_of(S.begin() + 1, S.end(),		std::all_of(S.begin() + 1, S.end(),
[](char C) { return C == '_' \|\| isAlnum(C); });		[](char C) { return C == '_' \|\| isAlnum(C); });
}		}

Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,		Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
const Triple &TheTriple, StringRef Arch) {		const Triple &TheTriple, StringRef Arch,
		bool &WholeProgram) {
SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;		SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;		SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
SmallVector<std::string, 4> NewInputFiles;		SmallVector<std::string, 4> NewInputFiles;
DenseSet<StringRef> UsedInRegularObj;		DenseSet<StringRef> UsedInRegularObj;
DenseSet<StringRef> UsedInSharedLib;		DenseSet<StringRef> UsedInSharedLib;
BumpPtrAllocator Alloc;		BumpPtrAllocator Alloc;
StringSaver Saver(Alloc);		StringSaver Saver(Alloc);

▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	auto OutputBitcode = [&](size_t Task, const Module &M) {
if (EC)		if (EC)
HandleError(errorCodeToError(EC));		HandleError(errorCodeToError(EC));
WriteBitcodeToFile(M, LinkedBitcode);		WriteBitcodeToFile(M, LinkedBitcode);
NewInputFiles.push_back(static_cast<std::string>(TempFile));		NewInputFiles.push_back(static_cast<std::string>(TempFile));
return false;		return false;
};		};

// We assume visibility of the whole program if every input file was bitcode.		// We assume visibility of the whole program if every input file was bitcode.
bool WholeProgram = BitcodeFiles.size() == InputFiles.size();		WholeProgram = BitcodeFiles.size() == InputFiles.size();
auto LTOBackend =		auto LTOBackend =
(EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)		(EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)
: createLTO(TheTriple, Arch, WholeProgram);		: createLTO(TheTriple, Arch, WholeProgram);

// We need to resolve the symbols so the LTO backend knows which symbols need		// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution		// to be kept or can be internalized. This is a simplified symbol resolution
// scheme to approximate the full resolution a linker would do.		// scheme to approximate the full resolution a linker would do.
DenseSet<StringRef> PrevailingSymbols;		DenseSet<StringRef> PrevailingSymbols;
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
};		};

if (Error Err = LTOBackend->run(AddStream))		if (Error Err = LTOBackend->run(AddStream))
return Err;		return Err;

// Is we are compiling for NVPTX we need to run the assembler first.		// Is we are compiling for NVPTX we need to run the assembler first.
if (TheTriple.isNVPTX() && !EmbedBitcode) {		if (TheTriple.isNVPTX() && !EmbedBitcode) {
for (auto &File : Files) {		for (auto &File : Files) {
auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);		auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram);
if (!FileOrErr)		if (!FileOrErr)
return FileOrErr.takeError();		return FileOrErr.takeError();
File = *FileOrErr;		File = *FileOrErr;
}		}
}		}

// Append the new inputs to the device linker input.		// Append the new inputs to the device linker input.
for (auto &File : Files)		for (auto &File : Files)
Show All 11 Lines	Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
DenseMap<DeviceFile, SmallVector<std::string, 4>> LinkerInputMap;		DenseMap<DeviceFile, SmallVector<std::string, 4>> LinkerInputMap;
for (auto &File : DeviceFiles)		for (auto &File : DeviceFiles)
LinkerInputMap[File].push_back(File.Filename);		LinkerInputMap[File].push_back(File.Filename);

// Try to link each device toolchain.		// Try to link each device toolchain.
for (auto &LinkerInput : LinkerInputMap) {		for (auto &LinkerInput : LinkerInputMap) {
DeviceFile &File = LinkerInput.getFirst();		DeviceFile &File = LinkerInput.getFirst();
Triple TheTriple = Triple(File.TheTriple);		Triple TheTriple = Triple(File.TheTriple);
		bool WholeProgram = false;

// Run LTO on any bitcode files and replace the input with the result.		// Run LTO on any bitcode files and replace the input with the result.
if (Error Err =		if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))		File.Arch, WholeProgram))
return Err;		return Err;

// If we are embedding bitcode for JIT, skip the final device linking.		// If we are embedding bitcode for JIT, skip the final device linking.
if (EmbedBitcode) {		if (EmbedBitcode) {
assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");		assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");
LinkedImages.push_back(LinkerInput.getSecond().front());		LinkedImages.push_back(LinkerInput.getSecond().front());
continue;		continue;
}		}

		// If we performed LTO on NVPTX and had whole program visibility, we can use
		// CUDA in non-RDC mode.
		if (WholeProgram && TheTriple.isNVPTX()) {
		assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
		LinkedImages.push_back(LinkerInput.getSecond().front());
		continue;
		}

auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);		auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
if (!ImageOrErr)		if (!ImageOrErr)
return ImageOrErr.takeError();		return ImageOrErr.takeError();

LinkedImages.push_back(*ImageOrErr);		LinkedImages.push_back(*ImageOrErr);
}		}
return Error::success();		return Error::success();
}		}
▲ Show 20 Lines • Show All 219 Lines • Show Last 20 Lines