Index: streamexecutor/include/streamexecutor/KernelSpec.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/KernelSpec.h
@@ -0,0 +1,298 @@
+//===-- KernelSpec.h - Kernel loader spec types -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// A KernelLoaderSpec is a class that knows where to find the code for a
+/// data-parallel kernel in a single format on a single platform. So, for
+/// example, there will be one subclass that deals with CUDA PTX code, another
+/// subclass that deals with CUDA cubin code, and yet another subclass that
+/// deals with OpenCL text code.
+///
+/// A MultiKernelLoaderSpec is basically a collection of KernelLoaderSpec
+/// instances. This is useful when code is available for the same kernel in
+/// several different formats or targeted for several different platforms. All
+/// the various KernelLoaderSpec instances for this kernel can be combined
+/// together in one MultiKernelLoaderSpec, and the specific platform consumer
+/// can decide which instance of the code it wants to use.
+///
+/// Rather than instantiating each KernelLoaderSpec separately and then
+/// registering it with the MultiKernelLoaderSpec, the MultiKernelLoaderSpec
+/// provides several helper functions, such as
+/// MultiKernelLoaderSpec::addCUDAPTXInMemory, to construct the
+/// KernelLoaderSpec and add it to the MultiKernelLoaderSpec at the same time.
+///
+/// These loader spec classes are designed primarily to be instantiated by the
+/// compiler, but they can also be instantiated directly by the user. A
+/// simplified example workflow which a compiler might follow in the case of a
+/// CUDA kernel that is compiled to CUDA cubin code is as follows:
+///
+/// 1. The user defines a kernel function called UserKernel.
+/// 2. The compiler compiles the kernel code into cubin data and embeds that
+///    data into the host code at address __UserKernelCubinAddress.
+/// 3. The compiler adds code at the beginning of the host code to instantiate
+///    a MultiKernelLoaderSpec:
+///    \code
+///    namespace compiler_cuda_namespace {
+///    MultiKernelLoaderSpec UserKernelLoaderSpec;
+///    } // namespace compiler_cuda_namespace
+///    \endcode
+/// 4. The compiler then adds code to the host code to add the cubin data to
+///    the new MultiKernelLoaderSpec, and to associate that data with the
+///    kernel name "UserKernel":
+///    \code
+///    namespace compiler_cuda_namespace {
+///    UserKernelLoaderSpec.addCUDACubinInMemory(
+///        __UserKernelCubinAddress, "UserKernel");
+///    } // namespace compiler_cuda_namespace
+///    \endcode
+/// 5. The host code, knowing beforehand that the compiler would initialize a
+///    MultiKernelLoaderSpec based on the name of the CUDA kernel, makes use of
+///    the symbol compiler_cuda_namespace::UserKernelLoaderSpec without
+///    defining it.
+///
+/// In the example above, the MultiKernelLoaderSpec instance created by the
+/// compiler can be used by the host code to create StreamExecutor kernel
+/// objects. In turn, those StreamExecutor kernel objects can be used by the
+/// host code to launch the kernel on the device as desired.
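+///
+/// For illustration only, here is a minimal sketch (with hypothetical variable
+/// names) of how host code might query the compiler-created spec before
+/// handing the code off to a platform-specific loader:
+/// \code
+/// namespace ccn = compiler_cuda_namespace;
+/// if (ccn::UserKernelLoaderSpec.hasCUDACubinInMemory()) {
+///   const char *Cubin =
+///       ccn::UserKernelLoaderSpec.getCUDACubinInMemory().getBytes();
+///   // ... pass Cubin to the platform-specific loading routine ...
+/// }
+/// \endcode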
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_KERNELSPEC_H
+#define STREAMEXECUTOR_KERNELSPEC_H
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <tuple>
+
+#include "llvm/Support/RWMutex.h"
+
+namespace streamexecutor {
+
+/// An object that knows how to find the code for a device kernel.
+///
+/// This is the base class for the hierarchy of loader specs. The different
+/// subclasses know how to find code in different formats (e.g. CUDA PTX,
+/// OpenCL binary).
+///
+/// This base class has functionality for storing and getting the name of the
+/// kernel as a string.
+class KernelLoaderSpec {
+public:
+  /// Returns the name of the kernel this spec loads.
+  const std::string &getKernelName() const { return KernelName; }
+
+protected:
+  /// Allows subclasses to set the kernel name during construction.
+  explicit KernelLoaderSpec(const std::string &KernelName);
+
+private:
+  std::string KernelName;
+
+  KernelLoaderSpec(const KernelLoaderSpec &) = delete;
+  void operator=(const KernelLoaderSpec &) = delete;
+};
+
+/// A KernelLoaderSpec for CUDA PTX code that resides in memory.
+class CUDAPTXInMemory : public KernelLoaderSpec {
+public:
+  /// First component is major version, second component is minor version.
+  ///
+  /// A tuple is used because this will be the key of a map.
+  using ComputeCapability = std::tuple<int, int>;
+
+  /// PTX code combined with its compute capability.
+  struct PTXSpec {
+    ComputeCapability TheComputeCapability;
+    const char *PTXCode;
+  };
+
+  /// Single-PTX constructor.
+  ///
+  /// Adds the provided PTX version with the minimum compute capability.
+  ///
+  /// Since the compute capability is unknown, the PTX is assumed to be very
+  /// generally usable - in other words, PTX specified in this manner is VERY
+  /// likely to be used as the default.
+  ///
+  /// Holds a reference to the passed in PTXCode memory. Does not make a copy
+  /// and does not take ownership.
+  CUDAPTXInMemory(const char *PTXCode, const std::string &KernelName);
+
+  /// Multiple-PTX-version constructor.
+  ///
+  /// Adds each item in SpecList to this object.
+  ///
+  /// Holds a reference to each passed in PTXCode memory. Does not make a copy
+  /// and does not take ownership.
+  CUDAPTXInMemory(const std::initializer_list<PTXSpec> &SpecList,
+                  const std::string &KernelName);
+
+  /// Returns a pointer to the PTX code with the lowest-valued compute
+  /// capability.
+  ///
+  /// For example, if PTX codes of compute capabilities 2.0, 3.0, and 3.5 are
+  /// all available, the version for compute capability 2.0 will be returned.
+  /// Returns nullptr on failed lookup (if no version is available).
+  const char *getDefaultCode() const;
+
+  /// Returns a pointer to the PTX code for the requested compute capability.
+  ///
+  /// Returns nullptr on failed lookup (if the requested version is not
+  /// available).
+  const char *getCode(int ComputeCapabilityMajor,
+                      int ComputeCapabilityMinor) const;
+
+private:
+  /// PTX code contents in memory.
+  ///
+  /// The key is a (major, minor) compute capability tuple, e.g. (2,0), (3,0),
+  /// (3,5). Because compute capabilities represented in this way have a clear
+  /// sorting order, map::begin() will give the lowest-numbered version
+  /// available, i.e. the default.
+  std::map<ComputeCapability, const char *> PTXByComputeCapability;
+
+  /// Defines the minimum compute capability possible.
+  ///
+  /// Used when PTX has no compute capability specified (in the single-PTX
+  /// constructor).
+  static const ComputeCapability MinimumComputeCapability;
+
+  CUDAPTXInMemory(const CUDAPTXInMemory &) = delete;
+  void operator=(const CUDAPTXInMemory &) = delete;
+};
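+
+// For illustration only (not part of this header's API): a hypothetical
+// consumer holding a CUDAPTXInMemory instance named PTXLoaderSpec might select
+// PTX for a specific compute capability and fall back to the default when no
+// exact match was registered, e.g.:
+//
+//   const char *PTX = PTXLoaderSpec.getCode(3, 5);
+//   if (!PTX)
+//     PTX = PTXLoaderSpec.getDefaultCode();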
+
+/// A KernelLoaderSpec for CUDA cubin code that resides in memory.
+class CUDACubinInMemory : public KernelLoaderSpec {
+public:
+  /// Creates a CUDACubinInMemory with a reference to the given cubin bytes.
+  ///
+  /// Holds a reference to the passed in Bytes memory. Does not make a copy
+  /// and does not take ownership.
+  CUDACubinInMemory(const char *Bytes, const std::string &KernelName);
+
+  /// Gets the cubin data bytes.
+  const char *getBytes() const { return Bytes; }
+
+private:
+  const char *Bytes;
+
+  CUDACubinInMemory(const CUDACubinInMemory &) = delete;
+  void operator=(const CUDACubinInMemory &) = delete;
+};
+
+/// A KernelLoaderSpec for OpenCL text that resides in memory.
+class OpenCLTextInMemory : public KernelLoaderSpec {
+public:
+  /// Creates an OpenCLTextInMemory with a reference to the given OpenCL text
+  /// code bytes.
+  ///
+  /// Holds a reference to the passed in Text memory. Does not make a copy and
+  /// does not take ownership.
+  OpenCLTextInMemory(const char *Text, const std::string &KernelName);
+
+  /// Returns the OpenCL text contents.
+  const std::string &getText() const { return Text; }
+
+private:
+  /// OpenCL translation unit text contents in memory.
+  std::string Text;
+
+  OpenCLTextInMemory(const OpenCLTextInMemory &) = delete;
+  void operator=(const OpenCLTextInMemory &) = delete;
+};
+
+/// An object to store several different KernelLoaderSpecs for the same kernel.
+///
+/// This allows code in different formats and for different platforms to be
+/// stored all together for a single kernel.
+///
+/// Various methods are available to add a new KernelLoaderSpec to a
+/// MultiKernelLoaderSpec. There are also methods to query which formats and
+/// platforms are supported by the currently added KernelLoaderSpec objects,
+/// and methods to get the KernelLoaderSpec objects for each format and
+/// platform.
+///
+/// Since all stored KernelLoaderSpecs are supposed to reference the same
+/// kernel, they are all assumed to take the same number and type of
+/// parameters, and to have the same name, but no checking is done to enforce
+/// this. Since this interface is prone to errors, it is better to leave
+/// MultiKernelLoaderSpec creation and initialization to the compiler rather
+/// than doing it by hand.
+class MultiKernelLoaderSpec {
+public:
+  // Convenience getters for testing whether these platform variants have
+  // kernel loader specifications available.
+
+  bool hasCUDAPTXInMemory() const { return TheCUDAPTXInMemory != nullptr; }
+  bool hasCUDACubinInMemory() const { return TheCUDACubinInMemory != nullptr; }
+  bool hasOpenCLTextInMemory() const {
+    return TheOpenCLTextInMemory != nullptr;
+  }
+
+  // Accessors for platform variant kernel load specifications.
+  //
+  // Precondition: the corresponding has* method returns true.
+
+  const CUDACubinInMemory &getCUDACubinInMemory() const {
+    assert(hasCUDACubinInMemory());
+    return *TheCUDACubinInMemory;
+  }
+  const CUDAPTXInMemory &getCUDAPTXInMemory() const {
+    assert(hasCUDAPTXInMemory());
+    return *TheCUDAPTXInMemory;
+  }
+  const OpenCLTextInMemory &getOpenCLTextInMemory() const {
+    assert(hasOpenCLTextInMemory());
+    return *TheOpenCLTextInMemory;
+  }
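+
+  // For illustration only: because each add* method below returns "this", a
+  // hypothetical initializer could register several code formats for one
+  // kernel in a single chained expression, e.g.:
+  //
+  //   Spec.addCUDAPTXInMemory(PTXString, "UserKernel")
+  //       ->addOpenCLTextInMemory(OpenCLString, "UserKernel");
+  //
+  // (Spec, PTXString, and OpenCLString are hypothetical names.)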
+
+  // Builder-pattern-like methods for use in initializing a
+  // MultiKernelLoaderSpec.
+  //
+  // Each of these should be used at most once for a single
+  // MultiKernelLoaderSpec object. See file comment for example usage.
+  //
+  // Note that the KernelName parameter must be consistent with the kernel in
+  // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
+  // name may be mangled by the compiler if it is not declared in an
+  // extern "C" scope.
+
+  /// Holds a reference to the PTXCode memory. Does not make a copy and does
+  /// not take ownership.
+  MultiKernelLoaderSpec *addCUDAPTXInMemory(const char *PTXCode,
+                                            const std::string &KernelName);
+
+  /// Holds a reference to each PTX code memory in SpecList. Does not make a
+  /// copy and does not take ownership.
+  MultiKernelLoaderSpec *
+  addCUDAPTXInMemory(std::initializer_list<CUDAPTXInMemory::PTXSpec> SpecList,
+                     const std::string &KernelName);
+
+  /// Holds a reference to the CubinBytes memory. Does not make a copy and
+  /// does not take ownership.
+  MultiKernelLoaderSpec *addCUDACubinInMemory(const char *CubinBytes,
+                                              const std::string &KernelName);
+
+  /// Holds a reference to the OpenCLText memory. Does not make a copy and
+  /// does not take ownership.
+  MultiKernelLoaderSpec *addOpenCLTextInMemory(const char *OpenCLText,
+                                               const std::string &KernelName);
+
+private:
+  /// PTX code that resides in memory.
+  std::unique_ptr<CUDAPTXInMemory> TheCUDAPTXInMemory;
+
+  /// Binary CUDA program in memory.
+  std::unique_ptr<CUDACubinInMemory> TheCUDACubinInMemory;
+
+  /// OpenCL text that resides in memory.
+  std::unique_ptr<OpenCLTextInMemory> TheOpenCLTextInMemory;
+};
+
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_KERNELSPEC_H
Index: streamexecutor/lib/CMakeLists.txt
===================================================================
--- streamexecutor/lib/CMakeLists.txt
+++ streamexecutor/lib/CMakeLists.txt
@@ -2,3 +2,8 @@
   utils
   OBJECT
   Utils/Error.cpp)
+
+add_library(
+  streamexecutor
+  $<TARGET_OBJECTS:utils>
+  KernelSpec.cpp)
Index: streamexecutor/lib/KernelSpec.cpp
===================================================================
--- /dev/null
+++ streamexecutor/lib/KernelSpec.cpp
@@ -0,0 +1,105 @@
+//===-- KernelSpec.cpp - General kernel spec implementation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation details for kernel loader specs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/KernelSpec.h"
+
+#include <cassert>
+#include <string>
+
+#include "llvm/ADT/STLExtras.h"
+
+namespace streamexecutor {
+
+KernelLoaderSpec::KernelLoaderSpec(const std::string &KernelName)
+    : KernelName(KernelName) {}
+
+// Minimum compute capability is 1.0.
+const CUDAPTXInMemory::ComputeCapability
+    CUDAPTXInMemory::MinimumComputeCapability{1, 0};
+
+CUDAPTXInMemory::CUDAPTXInMemory(const char *PTXCode,
+                                 const std::string &KernelName)
+    : KernelLoaderSpec(KernelName) {
+  PTXByComputeCapability.emplace(MinimumComputeCapability, PTXCode);
+}
+
+CUDAPTXInMemory::CUDAPTXInMemory(
+    const std::initializer_list<PTXSpec> &SpecList,
+    const std::string &KernelName)
+    : KernelLoaderSpec(KernelName) {
+  for (const auto &Spec : SpecList) {
+    PTXByComputeCapability.emplace(Spec.TheComputeCapability, Spec.PTXCode);
+  }
+}
+
+const char *CUDAPTXInMemory::getDefaultCode() const {
+  if (PTXByComputeCapability.empty()) {
+    return nullptr;
+  }
+  return PTXByComputeCapability.begin()->second;
+}
+
+const char *CUDAPTXInMemory::getCode(int ComputeCapabilityMajor,
+                                     int ComputeCapabilityMinor) const {
+  auto PTXIter =
+      PTXByComputeCapability.find(CUDAPTXInMemory::ComputeCapability{
+          ComputeCapabilityMajor, ComputeCapabilityMinor});
+  if (PTXIter == PTXByComputeCapability.end()) {
+    return nullptr;
+  }
+  return PTXIter->second;
+}
+
+CUDACubinInMemory::CUDACubinInMemory(const char *Bytes,
+                                     const std::string &KernelName)
+    : KernelLoaderSpec(KernelName), Bytes(Bytes) {}
+
+OpenCLTextInMemory::OpenCLTextInMemory(const char *Text,
+                                       const std::string &KernelName)
+    : KernelLoaderSpec(KernelName), Text(Text) {}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addCUDAPTXInMemory(const char *PTXCode,
+                                          const std::string &KernelName) {
+  assert(TheCUDAPTXInMemory == nullptr);
+  TheCUDAPTXInMemory =
+      llvm::make_unique<CUDAPTXInMemory>(PTXCode, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::addCUDAPTXInMemory(
+    std::initializer_list<CUDAPTXInMemory::PTXSpec> SpecList,
+    const std::string &KernelName) {
+  assert(TheCUDAPTXInMemory == nullptr);
+  TheCUDAPTXInMemory =
+      llvm::make_unique<CUDAPTXInMemory>(SpecList, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addCUDACubinInMemory(const char *Bytes,
+                                            const std::string &KernelName) {
+  assert(TheCUDACubinInMemory == nullptr);
+  TheCUDACubinInMemory =
+      llvm::make_unique<CUDACubinInMemory>(Bytes, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addOpenCLTextInMemory(const char *OpenCLText,
+                                             const std::string &KernelName) {
+  assert(TheOpenCLTextInMemory == nullptr);
+  TheOpenCLTextInMemory =
+      llvm::make_unique<OpenCLTextInMemory>(OpenCLText, KernelName);
+  return this;
+}
+
+} // namespace streamexecutor