Index: tools/CMakeLists.txt
===================================================================
--- tools/CMakeLists.txt
+++ tools/CMakeLists.txt
@@ -15,6 +15,8 @@
   add_subdirectory(clang-check)
 endif()
 
+add_subdirectory(ptxwrap)
+
 # We support checking out the clang-tools-extra repository into the 'extra'
 # subdirectory. It contains tools developed as part of the Clang/LLVM project
 # on top of the Clang tooling platform. We keep them in a separate repository
Index: tools/ptxwrap/CMakeLists.txt
===================================================================
--- /dev/null
+++ tools/ptxwrap/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS support)
+
+add_clang_executable(clang-ptxwrap
+  ptxwrap_main.cpp
+  PtxWrap.cpp
+  )
+
+set(CLANG_PTXWRAP_LIB_DEPS
+  clangBasic
+  )
+
+target_link_libraries(clang-ptxwrap
+  ${CLANG_PTXWRAP_LIB_DEPS}
+  )
+
+install(TARGETS clang-ptxwrap RUNTIME DESTINATION bin)
Index: tools/ptxwrap/PtxWrap.h
===================================================================
--- /dev/null
+++ tools/ptxwrap/PtxWrap.h
@@ -0,0 +1,150 @@
+// Declarations for clang-ptxwrap: packs PTX text into a fatbin-style image
+// and, in stub mode, emits host-side source that embeds the image and
+// registers its kernels with the CUDA runtime.
+//
+// NOTE(review): both headers are written to the output as raw in-memory
+// structs, so the image presumably assumes a little-endian host -- confirm.
+#ifndef LLVM_CLANG_TOOLS_PTXWRAP_PTXWRAP_H
+#define LLVM_CLANG_TOOLS_PTXWRAP_PTXWRAP_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+// What Write() produces: the bare fatbin image or host-side source code.
+enum WrapMode { GenFatbin, GenStub };
+
+const uint32_t FatBinMagicValue = 0xba55ed50;
+const uint16_t FatBinVersionValue = 1;
+// Every embedded file's payload is padded to a multiple of this many bytes.
+const uint16_t FatBinFileAlignment = 8;
+
+// Header that starts the fatbin image; trailing comments are byte offsets.
+struct FatBinHeader {
+  uint32_t Magic;      // 0x00
+  uint16_t Version;    // 0x04
+  uint16_t HeaderSize; // 0x06
+  uint32_t DataSize;   // 0x08
+  uint32_t _unused;    // 0x0c
+
+  FatBinHeader()
+      : Magic(FatBinMagicValue), Version(FatBinVersionValue),
+        HeaderSize(sizeof(*this)), DataSize(0), _unused(0) {}
+};
+static_assert(sizeof(FatBinHeader) == 0x10,
+              "FatBinHeader layout must match the documented offsets");
+
+enum FatBinFileKind { FatBinFilePtx = 1 };
+enum FatBinFlags {
+  AddressSize64 = 0x01,
+  HasDebugInfo = 0x02,
+  ProducerCuda = 0x04,
+  ProducerOpenCL = 0x08,
+  HostLinux = 0x10,
+  HostMac = 0x20,
+  HostWindows = 0x40,
+  Compressed = 0x200
+};
+
+// Header that precedes each embedded file; trailing comments are byte offsets.
+struct FatBinFileHeader {
+  uint16_t Kind;             // 0x00
+  uint16_t unknown02;        // 0x02
+  uint32_t HeaderSize;       // 0x04
+  uint32_t DataSize;         // 0x08
+  uint32_t unknown0c;        // 0x0c
+  uint32_t CompressedSize;   // 0x10
+  uint32_t SubHeaderSize;    // 0x14
+  uint16_t VersionMinor;     // 0x18
+  uint16_t VersionMajor;     // 0x1a
+  uint32_t CudaArch;         // 0x1c
+  uint32_t unknown20;        // 0x20
+  uint32_t unknown24;        // 0x24
+  uint32_t Flags;            // 0x28
+  uint32_t unknown2c;        // 0x2c
+  uint32_t unknown30;        // 0x30
+  uint32_t unknown34;        // 0x34
+  uint32_t UncompressedSize; // 0x38
+  uint32_t unknown3c;        // 0x3c
+  uint32_t unknown40;        // 0x40
+  uint32_t unknown44;        // 0x44
+
+  // PayloadSize is the padded size of the data following this header and
+  // FileFlags is a combination of FatBinFlags. SubHeaderSize reads HeaderSize,
+  // which is declared (and therefore initialised) before it.
+  FatBinFileHeader(uint32_t PayloadSize, uint32_t FileFlags)
+      : Kind(FatBinFilePtx), unknown02(0x0101), HeaderSize(sizeof(*this)),
+        DataSize(PayloadSize), unknown0c(0), CompressedSize(0),
+        SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4),
+        CudaArch(35), unknown20(0), unknown24(0), Flags(FileFlags),
+        unknown2c(0), unknown30(0), unknown34(0), UncompressedSize(0),
+        unknown3c(0), unknown40(0), unknown44(0) {}
+};
+static_assert(sizeof(FatBinFileHeader) == 0x48,
+              "FatBinFileHeader layout must match the documented offsets");
+static_assert(sizeof(FatBinFileHeader) % FatBinFileAlignment == 0,
+              "file payloads must start on an aligned offset");
+
+// One PTX input: owns the file contents and the kernel names found in them.
+class PtxBlob {
+  std::unique_ptr<llvm::MemoryBuffer> PtxBuf;
+  llvm::StringRef PtxText; // View of PtxBuf's contents.
+  llvm::StringSet<> KnownKernels;
+
+public:
+  explicit PtxBlob(std::unique_ptr<llvm::MemoryBuffer> Ptx)
+      : PtxBuf(std::move(Ptx)), PtxText(PtxBuf->getBuffer()) {
+    FindKernels();
+  }
+  llvm::StringRef getPtxText() const { return PtxText; }
+  // Adds every kernel name found in this blob to Names.
+  void getKnownKernels(llvm::StringSet<> &Names) const;
+  uint32_t getFlags() const { return AddressSize64 | ProducerCuda | HostLinux; }
+
+private:
+  void FindKernels();
+};
+
+// Collects PTX inputs and writes them to the output file in the chosen mode.
+class PtxWrapper {
+  WrapMode Mode;
+  // Not owned: the caller must keep the underlying string alive.
+  const llvm::StringRef OutputFileName;
+  llvm::SmallVector<PtxBlob, 4> InputPtx;
+  llvm::StringSet<> KnownKernels;
+  std::unique_ptr<llvm::raw_fd_ostream> OS;
+
+public:
+  // Opens OutputPath right away; the process exits if that fails.
+  PtxWrapper(WrapMode WrapKind, llvm::StringRef OutputPath)
+      : Mode(WrapKind), OutputFileName(OutputPath), OS(nullptr) {
+    Init();
+  }
+
+  // Reads the PTX file FileName and queues it. Returns true on error.
+  bool Wrap(llvm::StringRef FileName);
+  // Writes all queued PTX in the selected mode. Currently always returns false.
+  bool Write();
+
+private:
+  void Init();
+  void AddPtxToFatbin(std::unique_ptr<llvm::MemoryBuffer> PtxBuf);
+  std::string CreateFatbin();
+  void WritePrologue();
+  void WriteFatBinArray(const std::string &FatBinString);
+  void WriteRegistrationCode();
+  void WriteEpilogue();
+  void CreateFatBinHeader(llvm::raw_string_ostream &FatBinStream,
+                          size_t FatBinDataSize);
+  std::string BuildFatBinPtx(const PtxBlob &Ptx);
+  void UpdateFatBinHeader(llvm::raw_string_ostream &FatBinStream);
+};
+
+#endif // LLVM_CLANG_TOOLS_PTXWRAP_PTXWRAP_H
Index: tools/ptxwrap/PtxWrap.cpp
===================================================================
--- /dev/null
+++ tools/ptxwrap/PtxWrap.cpp
@@ -0,0 +1,195 @@
+#include "PtxWrap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+#include <system_error>
+
+using namespace llvm;
+
+// Records the name in every ".entry KERNEL_NAME (" occurrence of the PTX text.
+void PtxBlob::FindKernels() {
+  const StringRef EntryKw = ".entry";
+  size_t EntryEnd = 0;
+  while (true) {
+    const size_t EntryStart = PtxText.find(EntryKw, EntryEnd);
+    if (EntryStart == StringRef::npos)
+      break;
+    EntryEnd = PtxText.find('(', EntryStart);
+    // No '(' after the keyword: stop instead of recording the remainder of
+    // the text as a kernel name.
+    if (EntryEnd == StringRef::npos)
+      break;
+    KnownKernels.insert(
+        PtxText.slice(EntryStart + EntryKw.size(), EntryEnd).trim());
+  }
+}
+
+void PtxBlob::getKnownKernels(StringSet<> &Names) const {
+  for (const auto &Kernel : KnownKernels)
+    Names.insert(Kernel.first());
+}
+
+// Appends the top-level header, announcing FatBinDataSize bytes of payload.
+void PtxWrapper::CreateFatBinHeader(raw_string_ostream &FatBinStream,
+                                    size_t FatBinDataSize) {
+  FatBinHeader Hdr;
+  Hdr.DataSize = static_cast<uint32_t>(FatBinDataSize);
+  FatBinStream.write(reinterpret_cast<const char *>(&Hdr), sizeof(Hdr));
+}
+
+// Returns one file entry: a FatBinFileHeader followed by the PTX text, padded
+// with spaces to a multiple of FatBinFileAlignment.
+std::string PtxWrapper::BuildFatBinPtx(const PtxBlob &Ptx) {
+  std::string PtxString;
+  raw_string_ostream PtxStream(PtxString);
+  const StringRef Text = Ptx.getPtxText();
+  // Round the payload size up to the alignment so that the header describes
+  // exactly the bytes written below.
+  const size_t Align = FatBinFileAlignment;
+  const size_t DataSize = (Text.size() + Align - 1) & ~(Align - 1);
+  FatBinFileHeader Header(static_cast<uint32_t>(DataSize), Ptx.getFlags());
+
+  PtxStream.write(reinterpret_cast<const char *>(&Header), sizeof(Header));
+  PtxStream << Text;
+  for (size_t Written = Text.size(); Written < DataSize; ++Written)
+    PtxStream << ' ';
+  return PtxStream.str();
+}
+
+// Emits the includes and opens the constructor function of the stub.
+void PtxWrapper::WritePrologue() {
+  // NOTE(review): the two angle-bracket header names were missing from the
+  // text under review; <stdlib.h> (atexit) and <crt/host_runtime.h>
+  // (__cudaFatCubinHandle etc.) are inferred from the symbols the generated
+  // code uses -- confirm against the original patch.
+  *OS << R"XX(
+#include "fatBinaryCtl.h"
+#include <stdlib.h>
+#define __CUDA_INTERNAL_COMPILATION__
+#include <crt/host_runtime.h>
+
+extern "C" {
+
+__attribute__((constructor))
+static void __load_ptx(void) {
+)XX";
+}
+
+// Emits the fatbin as an escaped string literal, the wrapper struct pointing
+// at it, and the __cudaRegisterFatBinary call.
+void PtxWrapper::WriteFatBinArray(const std::string &FatBinString) {
+  *OS << "__attribute__ ((section (\".nv_fatbin\"))) \n"
+      << "static const char __fatbin_array[] = \n\"";
+  OS->write_escaped(FatBinString);
+  *OS << "\";\n";
+  *OS << R"XX(
+  __attribute__ ((aligned (8)))
+  __attribute__ ((section (".nvFatBinSegment")))
+  static const __fatBinC_Wrapper_t __fatbin_wrapper =
+    {0x466243b1, 1, (const unsigned long long*)__fatbin_array, 0};
+  __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatbin_wrapper);
+  {
+    volatile static void **__ref __attribute__((unused));
+    __ref = (volatile void **)__cudaFatCubinHandle;
+  };
+)XX";
+}
+
+// Emits one __cudaRegisterFunction call per kernel name across all inputs.
+void PtxWrapper::WriteRegistrationCode() {
+  StringSet<> KernelNames;
+  for (const PtxBlob &Blob : InputPtx)
+    Blob.getKnownKernels(KernelNames);
+
+  unsigned KernelCount = 0;
+  for (const auto &Kernel : KernelNames) {
+    const StringRef Name = Kernel.first();
+    *OS << "extern void __kernel_launch_func" << KernelCount << "(void) asm(\""
+        << Name << "\");\n";
+    *OS << "static char __kernel_name" << KernelCount << "[] = \"" << Name
+        << "\";\n";
+    *OS << " __cudaRegisterFunction(__cudaFatCubinHandle, "
+        << "(const char *)__kernel_launch_func" << KernelCount
+        << ", __kernel_name" << KernelCount << ", \"" << Name << "\", "
+        << "-1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0);\n";
+    ++KernelCount;
+  }
+}
+
+// Registers the unload hook and closes the function and extern "C" block.
+void PtxWrapper::WriteEpilogue() {
+  *OS << R"XX(
+  atexit(__cudaUnregisterBinaryUtil);
+}
+
+} // extern "C"
+)XX";
+}
+
+// Returns the complete image: the top-level header and one entry per input.
+std::string PtxWrapper::CreateFatbin() {
+  std::string FatBinString;
+  raw_string_ostream FatBinStream(FatBinString);
+  SmallVector<std::string, 4> FatBinPieces;
+  size_t FatBinDataSize = 0;
+
+  // Build every entry first: the top-level header needs the total size.
+  for (const PtxBlob &Blob : InputPtx) {
+    FatBinPieces.push_back(BuildFatBinPtx(Blob));
+    errs() << "PTX size " << FatBinPieces.back().size() << "\n";
+    FatBinDataSize += FatBinPieces.back().size();
+  }
+  CreateFatBinHeader(FatBinStream, FatBinDataSize);
+  for (const std::string &FatBinPiece : FatBinPieces)
+    FatBinStream << FatBinPiece;
+  // str() flushes the stream; FatBinString itself may still lack buffered
+  // output at this point.
+  return FatBinStream.str();
+}
+
+void PtxWrapper::AddPtxToFatbin(std::unique_ptr<MemoryBuffer> PtxBuf) {
+  InputPtx.push_back(PtxBlob(std::move(PtxBuf)));
+  InputPtx.back().getKnownKernels(KnownKernels);
+}
+
+bool PtxWrapper::Wrap(StringRef FileName) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
+      MemoryBuffer::getFileOrSTDIN(FileName);
+  if (std::error_code EC = CodeOrErr.getError()) {
+    errs() << EC.message() << "\n";
+    return true;
+  }
+
+  AddPtxToFatbin(std::move(CodeOrErr.get()));
+  return false;
+}
+
+bool PtxWrapper::Write() {
+  const std::string FatBinString = CreateFatbin();
+
+  if (Mode == GenFatbin) {
+    *OS << FatBinString;
+  } else {
+    WritePrologue();
+    WriteFatBinArray(FatBinString);
+    WriteRegistrationCode();
+    WriteEpilogue();
+  }
+  OS->flush();
+  return false;
+}
+
+// Opens the output file; on failure reports the error and exits with 1.
+void PtxWrapper::Init() {
+  std::error_code EC;
+  std::unique_ptr<raw_fd_ostream> OutFile(
+      new raw_fd_ostream(OutputFileName, EC, sys::fs::F_RW));
+  if (EC) {
+    errs() << "Error opening '" << OutputFileName << "': " << EC.message()
+           << '\n';
+    exit(1);
+  }
+  OS = std::move(OutFile);
+}
Index: tools/ptxwrap/ptxwrap_main.cpp
===================================================================
--- /dev/null
+++ tools/ptxwrap/ptxwrap_main.cpp
@@ -0,0 +1,54 @@
+#include "PtxWrap.h"
+#include "clang/Basic/Version.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Signals.h"
+#include <string>
+
+using namespace llvm;
+
+static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden);
+static cl::opt<std::string> OutputFilename("o",
+                                           cl::desc("Specify output filename"),
+                                           cl::value_desc("filename"),
+                                           cl::Required);
+// NOTE(review): the placeholder words inside the angle brackets were missing
+// from the text under review; "<file>" is a reconstruction -- confirm.
+static cl::list<std::string> InputFilenames(cl::desc("<file> [<file> ...]"),
+                                            cl::Positional, cl::Required,
+                                            cl::OneOrMore);
+// Unnamed option: each listed value is its own flag; GenStub is the default.
+cl::opt<WrapMode> Mode(
+    cl::desc("Choose wrap mode:"),
+    cl::values(clEnumValN(GenFatbin, "fatbin", "Produce fatbin file."),
+               clEnumValN(GenStub, "stub", "Produce host-side source code."),
+               clEnumValEnd),
+    cl::init(GenStub));
+
+static void PrintVersion() {
+  outs() << clang::getClangToolFullVersion("clang-ptxwrap") << '\n';
+}
+
+int main(int argc, char *argv[]) {
+  sys::PrintStackTraceOnErrorSignal();
+
+  cl::SetVersionPrinter(PrintVersion);
+  cl::ParseCommandLineOptions(
+      argc, argv, "A tool to generate wrapper code for PTX assembly which\n"
+                  "would produce includable C++ code to register kernels\n"
+                  "with CUDA runtime.");
+  if (Help)
+    cl::PrintHelpMessage();
+
+  PtxWrapper Wrapper(Mode, OutputFilename);
+  bool Error = false;
+  for (const auto &Filename : InputFilenames)
+    Error |= Wrapper.Wrap(Filename);
+
+  if (!Error)
+    Error = Wrapper.Write();
+
+  return Error ? 1 : 0;
+}