diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h
--- a/clang/include/clang/Interpreter/Interpreter.h
+++ b/clang/include/clang/Interpreter/Interpreter.h
@@ -42,6 +42,16 @@ public:
   static llvm::Expected<std::unique_ptr<CompilerInstance>>
   create(std::vector<const char *> &ClangArgv);
+  static llvm::Expected<std::unique_ptr<CompilerInstance>>
+  createCpp(std::vector<const char *> &ClangArgv);
+};
+
+class IncrementalCudaCompilerBuilder {
+public:
+  static llvm::Expected<std::unique_ptr<CompilerInstance>>
+  createHost(std::vector<const char *> &ClangArgv, llvm::StringRef FatbinFile);
+  static llvm::Expected<std::unique_ptr<CompilerInstance>>
+  createDevice(std::vector<const char *> &ClangArgv);
 };
 
 /// Provides top-level interfaces for incremental compilation and execution.
@@ -50,6 +60,9 @@ class Interpreter {
   std::unique_ptr<IncrementalParser> IncrParser;
   std::unique_ptr<IncrementalExecutor> IncrExecutor;
 
+  // An optional parser for CUDA offloading
+  std::unique_ptr<IncrementalParser> DeviceParser;
+
   Interpreter(std::unique_ptr<CompilerInstance> CI, llvm::Error &Err);
 
   llvm::Error CreateExecutor();
@@ -58,6 +71,11 @@ public:
   ~Interpreter();
   static llvm::Expected<std::unique_ptr<Interpreter>>
   create(std::unique_ptr<CompilerInstance> CI);
+  static llvm::Expected<std::unique_ptr<Interpreter>>
+  createWithCUDA(std::unique_ptr<CompilerInstance> CI,
+                 std::unique_ptr<CompilerInstance> DCI,
+                 llvm::StringRef OffloadArch,
+                 llvm::StringRef TempDeviceCodeFilename);
   const CompilerInstance *getCompilerInstance() const;
   llvm::Expected<llvm::orc::LLJIT &> getExecutionEngine();
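
The two builder classes plus Interpreter::createWithCUDA() above are the whole embedding surface of this patch: one CompilerInstance is built per compilation (host and device), and both are handed to the interpreter. A minimal sketch of how a client is expected to wire them together (error handling elided; the arch string and fatbin path are placeholder values, and ClangRepl.cpp at the end of this patch is the authoritative user):

    llvm::ExitOnError ExitOnErr;
    std::vector<const char *> DeviceArgv, HostArgv;
    // Device side first: it produces the fatbinary the host side consumes.
    auto DCI = ExitOnErr(
        clang::IncrementalCudaCompilerBuilder::createDevice(DeviceArgv));
    auto CI = ExitOnErr(clang::IncrementalCudaCompilerBuilder::createHost(
        HostArgv, "/tmp/clang-repl.fatbin"));
    auto Interp = ExitOnErr(clang::Interpreter::createWithCUDA(
        std::move(CI), std::move(DCI), /*OffloadArch=*/"sm_70",
        /*TempDeviceCodeFilename=*/"/tmp/clang-repl.fatbin"));
    ExitOnErr(Interp->ParseAndExecute("__global__ void kernel() {}"));
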
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -264,6 +264,7 @@
   // Links each entry in LinkModules into our module. Returns true on error.
   bool LinkInModules() {
     for (auto &LM : LinkModules) {
+      assert(LM.Module && "LinkModule does not actually have a module");
       if (LM.PropagateAttrs)
         for (Function &F : *LM.Module) {
           // Skip intrinsics. Keep consistent with how intrinsics are created
@@ -292,6 +293,7 @@
       if (Err)
         return true;
     }
+    LinkModules.clear();
     return false; // success
   }
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6253,6 +6253,10 @@
 }
 
 void CodeGenModule::EmitTopLevelStmt(const TopLevelStmtDecl *D) {
+  // Device code should not be at top level.
+  if (LangOpts.CUDA && LangOpts.CUDAIsDevice)
+    return;
+
   std::unique_ptr<CodeGenFunction> &CurCGF =
       GlobalTopLevelStmtBlockInFlight.first;
 
diff --git a/clang/lib/Interpreter/CMakeLists.txt b/clang/lib/Interpreter/CMakeLists.txt
--- a/clang/lib/Interpreter/CMakeLists.txt
+++ b/clang/lib/Interpreter/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   core
   native
+  MC
   Option
   OrcJIT
   Support
@@ -12,6 +13,7 @@
   IncrementalExecutor.cpp
   IncrementalParser.cpp
   Interpreter.cpp
+  Offload.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/clang/lib/Interpreter/IncrementalParser.h b/clang/lib/Interpreter/IncrementalParser.h
--- a/clang/lib/Interpreter/IncrementalParser.h
+++ b/clang/lib/Interpreter/IncrementalParser.h
@@ -37,6 +37,7 @@
 /// changes between the subsequent incremental input.
 ///
 class IncrementalParser {
+protected:
   /// Long-lived, incremental parsing action.
   std::unique_ptr<IncrementalAction> Act;
 
@@ -56,17 +57,19 @@
   /// of code.
   std::list<PartialTranslationUnit> PTUs;
 
+  IncrementalParser();
+
 public:
   IncrementalParser(std::unique_ptr<CompilerInstance> Instance,
                     llvm::LLVMContext &LLVMCtx, llvm::Error &Err);
-  ~IncrementalParser();
+  virtual ~IncrementalParser();
 
   const CompilerInstance *getCI() const { return CI.get(); }
 
   /// Parses incremental input by creating an in-memory file.
   ///\returns a \c PartialTranslationUnit which holds information about the
   /// \c TranslationUnitDecl and \c llvm::Module corresponding to the input.
-  llvm::Expected<PartialTranslationUnit &> Parse(llvm::StringRef Input);
+  virtual llvm::Expected<PartialTranslationUnit &> Parse(llvm::StringRef Input);
 
   /// Uses the CodeGenModule mangled name cache and avoids recomputing.
   ///\returns the mangled name of a \c GD.
diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp
--- a/clang/lib/Interpreter/IncrementalParser.cpp
+++ b/clang/lib/Interpreter/IncrementalParser.cpp
@@ -122,6 +122,10 @@
   }
 };
 
+static CodeGenerator *getCodeGen(FrontendAction *Act);
+
+IncrementalParser::IncrementalParser() {}
+
 IncrementalParser::IncrementalParser(std::unique_ptr<CompilerInstance> Instance,
                                      llvm::LLVMContext &LLVMCtx,
                                      llvm::Error &Err)
@@ -135,6 +139,21 @@
   P.reset(
       new Parser(CI->getPreprocessor(), CI->getSema(), /*SkipBodies=*/false));
   P->Initialize();
+
+  // An initial PTU is needed as CUDA includes some headers automatically.
+  auto PTU = ParseOrWrapTopLevelDecl();
+  if (auto E = PTU.takeError()) {
+    consumeError(std::move(E)); // FIXME
+    return; // PTU.takeError();
+  }
+
+  if (CodeGenerator *CG = getCodeGen(Act.get())) {
+    std::unique_ptr<llvm::Module> M(CG->ReleaseModule());
+    CG->StartModule("incr_module_" + std::to_string(PTUs.size()),
+                    M->getContext());
+    PTU->TheModule = std::move(M);
+    assert(PTU->TheModule && "Failed to create initial PTU");
+  }
 }
 
 IncrementalParser::~IncrementalParser() {
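
The IncrementalParser changes above (protected members, a protected default constructor, and a virtual destructor and Parse) exist to make the class subclassable; the only subclass this patch adds is IncrementalCUDADeviceParser in Offload.h further down. The hook pattern they enable looks roughly like this (hypothetical subclass name, post-processing elided):

    class MyDeviceParser : public clang::IncrementalParser {
    public:
      using IncrementalParser::IncrementalParser;

      llvm::Expected<clang::PartialTranslationUnit &>
      Parse(llvm::StringRef Input) override {
        // Parse the increment with the base implementation first.
        auto PTU = IncrementalParser::Parse(Input);
        if (!PTU)
          return PTU.takeError();
        // ... post-process PTU->TheModule here (e.g. emit PTX) ...
        return PTU;
      }
    };
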
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -15,6 +15,7 @@
 
 #include "IncrementalExecutor.h"
 #include "IncrementalParser.h"
+#include "Offload.h"
 
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/TargetInfo.h"
@@ -139,7 +140,6 @@
   // action and use other actions in incremental mode.
   // FIXME: Print proper driver diagnostics if the driver flags are wrong.
   // We do C++ by default; append right after argv[0] if no "-x" given
-  ClangArgv.insert(ClangArgv.end(), "-xc++");
   ClangArgv.insert(ClangArgv.end(), "-Xclang");
   ClangArgv.insert(ClangArgv.end(), "-fincremental-extensions");
   ClangArgv.insert(ClangArgv.end(), "-c");
@@ -172,6 +172,37 @@
   return CreateCI(**ErrOrCC1Args);
 }
 
+llvm::Expected<std::unique_ptr<CompilerInstance>>
+IncrementalCompilerBuilder::createCpp(std::vector<const char *> &ClangArgv) {
+  ClangArgv.insert(ClangArgv.begin(), "-xc++");
+
+  return IncrementalCompilerBuilder::create(ClangArgv);
+}
+
+llvm::Expected<std::unique_ptr<CompilerInstance>>
+IncrementalCudaCompilerBuilder::createDevice(
+    std::vector<const char *> &ClangArgv) {
+  ClangArgv.insert(ClangArgv.begin(), "-xcuda");
+  ClangArgv.insert(ClangArgv.begin(), "--cuda-device-only");
+
+  return IncrementalCompilerBuilder::create(ClangArgv);
+}
+
+llvm::Expected<std::unique_ptr<CompilerInstance>>
+IncrementalCudaCompilerBuilder::createHost(std::vector<const char *> &ClangArgv,
+                                           llvm::StringRef FatbinFile) {
+  ClangArgv.insert(ClangArgv.begin(), "-xcuda");
+  ClangArgv.insert(ClangArgv.begin(), "--cuda-host-only");
+
+  auto CI = IncrementalCompilerBuilder::create(ClangArgv);
+  if (!CI)
+    return CI.takeError();
+
+  (*CI)->getCodeGenOpts().CudaGpuBinaryFileName = FatbinFile.str();
+
+  return CI;
+}
+
 Interpreter::Interpreter(std::unique_ptr<CompilerInstance> CI,
                          llvm::Error &Err) {
   llvm::ErrorAsOutParameter EAO(&Err);
@@ -200,6 +231,25 @@
   return std::move(Interp);
 }
 
+llvm::Expected<std::unique_ptr<Interpreter>> Interpreter::createWithCUDA(
+    std::unique_ptr<CompilerInstance> CI, std::unique_ptr<CompilerInstance> DCI,
+    llvm::StringRef CudaArch, llvm::StringRef TempDeviceCodeFilename) {
+  auto Interp = Interpreter::create(std::move(CI));
+  if (auto E = Interp.takeError())
+    return std::move(E);
+
+  llvm::Error Err = llvm::Error::success();
+  auto DeviceParser = std::make_unique<IncrementalCUDADeviceParser>(
+      std::move(DCI), *(*Interp)->TSCtx->getContext(), CudaArch,
+      TempDeviceCodeFilename, Err);
+  if (Err)
+    return std::move(Err);
+
+  (*Interp)->DeviceParser = std::move(DeviceParser);
+
+  return Interp;
+}
+
 const CompilerInstance *Interpreter::getCompilerInstance() const {
   return IncrParser->getCI();
 }
@@ -215,6 +265,11 @@
 
 llvm::Expected<PartialTranslationUnit &>
 Interpreter::Parse(llvm::StringRef Code) {
+  // Parse the code for the device side first; the host compilation below
+  // picks up the fatbinary the device parser writes out.
+  if (DeviceParser) {
+    auto DevicePTU = DeviceParser->Parse(Code);
+    if (auto E = DevicePTU.takeError())
+      return std::move(E);
+  }
   return IncrParser->Parse(Code);
 }
 
diff --git a/clang/lib/Interpreter/Offload.h b/clang/lib/Interpreter/Offload.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/Interpreter/Offload.h
@@ -0,0 +1,47 @@
+//===--------------- Offload.h - CUDA Offloading ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes required for offloading to CUDA devices.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_INTERPRETER_OFFLOAD_H
+#define LLVM_CLANG_LIB_INTERPRETER_OFFLOAD_H
+
+#include "IncrementalParser.h"
+
+namespace clang {
+
+class DeviceCodeInlinerAction;
+
+class IncrementalCUDADeviceParser : public IncrementalParser {
+public:
+  IncrementalCUDADeviceParser(std::unique_ptr<CompilerInstance> Instance,
+                              llvm::LLVMContext &LLVMCtx, llvm::StringRef Arch,
+                              llvm::StringRef FatbinFile, llvm::Error &Err);
+
+  llvm::Expected<PartialTranslationUnit &>
+  Parse(llvm::StringRef Input) override;
+
+  // Generate PTX for the last PTU
+  llvm::Expected<llvm::StringRef> GeneratePTX();
+
+  // Write last PTX to the fatbinary file
+  llvm::Error WriteFatbinary() const;
+
+  ~IncrementalCUDADeviceParser();
+
+protected:
+  int SMVersion;
+  std::string FatbinFilePath;
+  llvm::SmallString<1024> PTXCode;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_INTERPRETER_OFFLOAD_H
diff --git a/clang/lib/Interpreter/Offload.cpp b/clang/lib/Interpreter/Offload.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/Interpreter/Offload.cpp
@@ -0,0 +1,164 @@
+//===-------------- Offload.cpp - CUDA Offloading ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements offloading to CUDA devices.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Offload.h"
+
+#include "clang/Frontend/CompilerInstance.h"
+
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace clang {
+
+IncrementalCUDADeviceParser::IncrementalCUDADeviceParser(
+    std::unique_ptr<CompilerInstance> Instance, llvm::LLVMContext &LLVMCtx,
+    llvm::StringRef Arch, llvm::StringRef FatbinFile, llvm::Error &Err)
+    : IncrementalParser(std::move(Instance), LLVMCtx, Err) {
+  if (Err)
+    return;
+
+  if (!Arch.starts_with("sm_") || Arch.substr(3).getAsInteger(10, SMVersion)) {
+    Err = llvm::joinErrors(std::move(Err),
+                           llvm::make_error<llvm::StringError>(
+                               "Invalid CUDA architecture",
+                               llvm::inconvertibleErrorCode()));
+    return;
+  }
+
+  FatbinFilePath = FatbinFile.str();
+}
+
+llvm::Expected<PartialTranslationUnit &>
+IncrementalCUDADeviceParser::Parse(llvm::StringRef Input) {
+  auto PTU = IncrementalParser::Parse(Input);
+  if (!PTU)
+    return PTU.takeError();
+
+  auto PTX = GeneratePTX();
+  if (!PTX)
+    return PTX.takeError();
+
+  auto Err = WriteFatbinary();
+  if (Err)
+    return std::move(Err);
+
+  return PTU;
+}
+
+llvm::Expected<llvm::StringRef> IncrementalCUDADeviceParser::GeneratePTX() {
+  auto &PTU = PTUs.back();
+  std::string Error;
+  const llvm::Target *Target = llvm::TargetRegistry::lookupTarget(
+      PTU.TheModule->getTargetTriple(), Error);
+  if (!Target)
+    return llvm::make_error<llvm::StringError>(std::move(Error),
+                                               std::error_code());
+  llvm::TargetOptions TO = llvm::TargetOptions();
+  llvm::TargetMachine *TargetMachine = Target->createTargetMachine(
+      PTU.TheModule->getTargetTriple(), "sm_" + llvm::itostr(SMVersion), "", TO,
+      llvm::Reloc::Model::PIC_);
+  PTU.TheModule->setDataLayout(TargetMachine->createDataLayout());
+
+  PTXCode.clear();
+  llvm::raw_svector_ostream dest(PTXCode);
+
+  llvm::legacy::PassManager PM;
+  if (TargetMachine->addPassesToEmitFile(PM, dest, nullptr,
+                                         llvm::CGFT_AssemblyFile)) {
+    return llvm::make_error<llvm::StringError>(
+        "NVPTX backend cannot produce PTX code.",
+        llvm::inconvertibleErrorCode());
+  }
+
+  if (!PM.run(*PTU.TheModule))
+    return llvm::make_error<llvm::StringError>("Failed to emit PTX code.",
+                                               llvm::inconvertibleErrorCode());
+
+  // Null-terminate the PTX and pad it to an 8-byte boundary for the fatbinary.
+  PTXCode += '\0';
+  while (PTXCode.size() % 8)
+    PTXCode += '\0';
+  return PTXCode.str();
+}
+
+llvm::Error IncrementalCUDADeviceParser::WriteFatbinary() const {
+  enum FatBinFlags {
+    AddressSize64 = 0x01,
+    HasDebugInfo = 0x02,
+    ProducerCuda = 0x04,
+    HostLinux = 0x10,
+    HostMac = 0x20,
+    HostWindows = 0x40
+  };
+
+  struct FatBinInnerHeader {
+    uint16_t Kind;             // 0x00
+    uint16_t unknown02;        // 0x02
+    uint32_t HeaderSize;       // 0x04
+    uint32_t DataSize;         // 0x08
+    uint32_t unknown0c;        // 0x0c
+    uint32_t CompressedSize;   // 0x10
+    uint32_t SubHeaderSize;    // 0x14
+    uint16_t VersionMinor;     // 0x18
+    uint16_t VersionMajor;     // 0x1a
+    uint32_t CudaArch;         // 0x1c
+    uint32_t unknown20;        // 0x20
+    uint32_t unknown24;        // 0x24
+    uint32_t Flags;            // 0x28
+    uint32_t unknown2c;        // 0x2c
+    uint32_t unknown30;        // 0x30
+    uint32_t unknown34;        // 0x34
+    uint32_t UncompressedSize; // 0x38
+    uint32_t unknown3c;        // 0x3c
+    uint32_t unknown40;        // 0x40
+    uint32_t unknown44;        // 0x44
+    FatBinInnerHeader(uint32_t DataSize, uint32_t CudaArch, uint32_t Flags)
+        : Kind(1 /*PTX*/), unknown02(0x0101), HeaderSize(sizeof(*this)),
+          DataSize(DataSize), unknown0c(0), CompressedSize(0),
+          SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4),
+          CudaArch(CudaArch), unknown20(0), unknown24(0), Flags(Flags),
+          unknown2c(0), unknown30(0), unknown34(0), UncompressedSize(0),
+          unknown3c(0), unknown40(0), unknown44(0) {}
+  };
+
+  struct FatBinHeader {
+    uint32_t Magic;      // 0x00
+    uint16_t Version;    // 0x04
+    uint16_t HeaderSize; // 0x06
+    uint32_t DataSize;   // 0x08
+    uint32_t unknown0c;  // 0x0c
+  public:
+    FatBinHeader(uint32_t DataSize)
+        : Magic(0xba55ed50), Version(1), HeaderSize(sizeof(*this)),
+          DataSize(DataSize), unknown0c(0) {}
+  };
+
+  std::error_code EC;
+  llvm::raw_fd_ostream os(FatbinFilePath.c_str(), EC, llvm::sys::fs::OF_None);
+  if (EC)
+    return llvm::errorCodeToError(EC);
+
+  FatBinHeader FatbinOuterHeader(sizeof(FatBinInnerHeader) + PTXCode.size());
+  os.write((const char *)&FatbinOuterHeader, FatbinOuterHeader.HeaderSize);
+
+  FatBinInnerHeader InnerHeader(PTXCode.size(), SMVersion,
+                                FatBinFlags::AddressSize64 |
+                                    FatBinFlags::HostLinux);
+  os.write((const char *)&InnerHeader, InnerHeader.HeaderSize);
+  os << PTXCode;
+
+  return llvm::Error::success();
+}
+
+IncrementalCUDADeviceParser::~IncrementalCUDADeviceParser() {}
+
+} // namespace clang
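
The two structs in WriteFatbinary() describe the undocumented fatbinary container layout; the hex comments record each field's byte offset, so the layout can be cross-checked. Hoisted to a scope where this is expressible, the offsets imply these exact sizes (a hypothetical sanity check, not part of the patch):

    static_assert(sizeof(FatBinHeader) == 0x10,
                  "outer fatbin header is 16 bytes");
    static_assert(sizeof(FatBinInnerHeader) == 0x48,
                  "inner fatbin header is 72 bytes");

Note also that SubHeaderSize is HeaderSize - 8, i.e. the inner header minus its first three fields (Kind, unknown02, HeaderSize), consistent with the 0x48-byte layout.
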
diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
--- a/clang/tools/clang-repl/ClangRepl.cpp
+++ b/clang/tools/clang-repl/ClangRepl.cpp
@@ -23,6 +23,9 @@
 #include "llvm/Support/TargetSelect.h" // llvm::Initialize*
 #include <optional>
 
+static llvm::cl::opt<bool> CudaEnabled("cuda", llvm::cl::Hidden);
+static llvm::cl::opt<std::string> OffloadArch("offload-arch", llvm::cl::Hidden);
+
 static llvm::cl::list<std::string>
     ClangArgs("Xcc",
               llvm::cl::desc("Argument to pass to the CompilerInvocation"),
@@ -90,9 +93,29 @@
     return 0;
   }
 
+  std::unique_ptr<clang::CompilerInstance> DeviceCI;
+  if (CudaEnabled) {
+    // Initialize the NVPTX backend so we can emit PTX for the device side.
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+
+    auto DeviceArgv = ClangArgv;
+
+    DeviceCI = ExitOnErr(
+        clang::IncrementalCudaCompilerBuilder::createDevice(DeviceArgv));
+  }
+
   // FIXME: Investigate if we could use runToolOnCodeWithArgs from tooling. It
   // can replace the boilerplate code for creation of the compiler instance.
-  auto CI = ExitOnErr(clang::IncrementalCompilerBuilder::create(ClangArgv));
+  std::unique_ptr<clang::CompilerInstance> CI;
+  if (CudaEnabled) {
+    CI = ExitOnErr(clang::IncrementalCudaCompilerBuilder::createHost(
+        ClangArgv, "/tmp/clang-repl.fatbin"));
+  } else {
+    CI = ExitOnErr(clang::IncrementalCompilerBuilder::createCpp(ClangArgv));
+  }
 
   // Set an error handler, so that any LLVM backend diagnostics go through our
   // error handler.
@@ -102,7 +125,19 @@
   // Load any requested plugins.
   CI->LoadRequestedPlugins();
 
-  auto Interp = ExitOnErr(clang::Interpreter::create(std::move(CI)));
+  std::unique_ptr<clang::Interpreter> Interp;
+  if (CudaEnabled) {
+    if (OffloadArch.empty()) {
+      OffloadArch = "sm_35";
+    }
+    Interp = ExitOnErr(clang::Interpreter::createWithCUDA(
+        std::move(CI), std::move(DeviceCI), OffloadArch,
+        "/tmp/clang-repl.fatbin"));
+
+    ExitOnErr(Interp->LoadDynamicLibrary("libcudart.so"));
+  } else
+    Interp = ExitOnErr(clang::Interpreter::create(std::move(CI)));
+
   for (const std::string &input : OptInputs) {
     if (auto Err = Interp->ParseAndExecute(input))
       llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "error: ");
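
With all of the above applied, CUDA statements can be fed to the REPL directly: "clang-repl --cuda" (optionally "--offload-arch=sm_XX"; both options are registered as hidden) parses each increment for the device first, packs the resulting PTX into /tmp/clang-repl.fatbin, compiles the host side against that fatbinary, and loads libcudart.so for the kernel-launch calls. Illustrative session input, assuming a CUDA-capable GPU and a CUDA installation clang can find (not part of the patch's tests):

    __global__ void add1(int *x) { *x += 1; }
    int *p; cudaMalloc((void **)&p, sizeof(int));
    int v = 41; cudaMemcpy(p, &v, sizeof(int), cudaMemcpyHostToDevice);
    add1<<<1, 1>>>(p);
    cudaMemcpy(&v, p, sizeof(int), cudaMemcpyDeviceToHost); // v is now 42
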