Index: streamexecutor/CMakeLists.txt =================================================================== --- streamexecutor/CMakeLists.txt +++ streamexecutor/CMakeLists.txt @@ -32,6 +32,7 @@ enable_testing() find_package(GTest REQUIRED) include_directories(${GTEST_INCLUDE_DIRS}) + find_package(Threads REQUIRED) endif() else(NOT STREAM_EXECUTOR_STANDALONE) if(STREAM_EXECUTOR_UNIT_TESTS) Index: streamexecutor/include/streamexecutor/KernelSpec.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/KernelSpec.h @@ -0,0 +1,263 @@ +//===-- KernelSpec.h - Kernel loader spec types -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// KernelLoaderSpec is the base class for types that know where to find the +/// code for a data-parallel kernel in a particular format on a particular +/// platform. So, for example, there will be one subclass that deals with CUDA +/// PTX code, another subclass that deals with CUDA fatbin code, and yet another +/// subclass that deals with OpenCL text code. +/// +/// A MultiKernelLoaderSpec is basically a collection of KernelLoaderSpec +/// instances. This is useful when code is available for the same kernel in +/// several different formats or targeted for several different platforms. All +/// the various KernelLoaderSpec instances for this kernel can be combined +/// together in one MultiKernelLoaderSpec and the specific platform consumer can +/// decide which instance of the code it wants to use. +/// +/// MultiKernelLoaderSpec provides several helper functions to build and +/// register KernelLoaderSpec instances all in a single operation. 
For example, +/// MultiKernelLoaderSpec::addCUDAPTXInMemory can be used to construct and +/// register a CUDAPTXInMemorySpec KernelLoaderSpec. +/// +/// The loader spec classes declared here are designed primarily to be +/// instantiated by the compiler, but they can also be instantiated directly by +/// the user. A simplified example workflow which a compiler might follow in the +/// case of a CUDA kernel that is compiled to CUDA fatbin code is as follows: +/// +/// 1. The user defines a kernel function called UserKernel. +/// 2. The compiler compiles the kernel code into CUDA fatbin data and embeds +/// that data into the host code at address __UserKernelFatbinAddress. +/// 3. The compiler adds code at the beginning of the host code to instantiate a +/// MultiKernelLoaderSpec: +/// \code +/// namespace compiler_cuda_namespace { +/// MultiKernelLoaderSpec UserKernelLoaderSpec; +/// } // namespace compiler_cuda_namespace +/// \endcode +/// 4. The compiler then adds code to the host code to add the fatbin data to +/// the new MultiKernelLoaderSpec, and to associate that data with the kernel +/// name "UserKernel": +/// \code +/// namespace compiler_cuda_namespace { +/// UserKernelLoaderSpec.addCUDAFatbinInMemory( +/// "UserKernel", __UserKernelFatbinAddress); +/// } // namespace compiler_cuda_namespace +/// \endcode +/// 5. The host code, having known beforehand that the compiler would initialize +/// a MultiKernelLoaderSpec based on the name of the CUDA kernel, makes use +/// of compiler_cuda_namespace::UserKernelLoaderSpec without defining it. +/// +/// In the example above, the MultiKernelLoaderSpec instance created by the +/// compiler can be used by the host code to create StreamExecutor kernel +/// objects. In turn, those StreamExecutor kernel objects can be used by the +/// host code to launch the kernel on the device as desired. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_KERNELSPEC_H +#define STREAMEXECUTOR_KERNELSPEC_H + +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" + +namespace streamexecutor { + +/// An object that knows how to find the code for a device kernel. +/// +/// This is the base class for the hierarchy of loader specs. The different +/// subclasses know how to find code in different formats (e.g. CUDA PTX, OpenCL +/// binary). +/// +/// This base class has functionality for storing and getting the name of the +/// kernel as a string. +class KernelLoaderSpec { +public: + /// Returns the name of the kernel this spec loads. + const std::string &getKernelName() const { return KernelName; } + +protected: + explicit KernelLoaderSpec(llvm::StringRef KernelName); + +private: + std::string KernelName; + + KernelLoaderSpec(const KernelLoaderSpec &) = delete; + KernelLoaderSpec &operator=(const KernelLoaderSpec &) = delete; +}; + +/// A KernelLoaderSpec for CUDA PTX code that resides in memory as a +/// null-terminated string. +class CUDAPTXInMemorySpec : public KernelLoaderSpec { +public: + /// First component is major version, second component is minor version. + using ComputeCapability = std::pair; + + /// PTX code combined with its compute capability. + struct PTXSpec { + ComputeCapability TheComputeCapability; + const char *PTXCode; + }; + + /// Creates a CUDAPTXInMemorySpec from an array of PTXSpec objects. + /// + /// Adds each item in SpecList to this object. + /// + /// Does not take ownership of the PTXCode pointers in the SpecList elements. + CUDAPTXInMemorySpec(llvm::StringRef KernelName, + const llvm::ArrayRef SpecList); + + /// Returns a pointer to the PTX code for the requested compute capability. + /// + /// Returns nullptr on failed lookup (if the requested compute capability is + /// not available). 
Matches exactly the specified compute capability. Doesn't + /// try to do anything smart like finding the next best compute capability if + /// the specified capability cannot be found. + const char *getCode(int ComputeCapabilityMajor, + int ComputeCapabilityMinor) const; + +private: + /// PTX code contents in memory. + /// + /// The key is a pair (cc_major, cc_minor), e.g., (2, 0), (3, 0), (3, 5). + std::map PTXByComputeCapability; + + CUDAPTXInMemorySpec(const CUDAPTXInMemorySpec &) = delete; + CUDAPTXInMemorySpec &operator=(const CUDAPTXInMemorySpec &) = delete; +}; + +/// A KernelLoaderSpec for CUDA fatbin code that resides in memory. +class CUDAFatbinInMemorySpec : public KernelLoaderSpec { +public: + /// Creates a CUDAFatbinInMemorySpec with a reference to the given fatbin + /// bytes. + /// + /// Does not take ownership of the Bytes pointer. + CUDAFatbinInMemorySpec(llvm::StringRef KernelName, const void *Bytes); + + /// Gets the fatbin data bytes. + const void *getBytes() const { return Bytes; } + +private: + const void *Bytes; + + CUDAFatbinInMemorySpec(const CUDAFatbinInMemorySpec &) = delete; + CUDAFatbinInMemorySpec &operator=(const CUDAFatbinInMemorySpec &) = delete; +}; + +/// A KernelLoaderSpec for OpenCL text that resides in memory as a +/// null-terminated string. +class OpenCLTextInMemorySpec : public KernelLoaderSpec { +public: + /// Creates an OpenCLTextInMemorySpec with a reference to the given OpenCL + /// text code bytes. + /// + /// Does not take ownership of the Text pointer. + OpenCLTextInMemorySpec(llvm::StringRef KernelName, const char *Text); + + /// Returns the OpenCL text contents. + const char *getText() const { return Text; } + +private: + const char *Text; + + OpenCLTextInMemorySpec(const OpenCLTextInMemorySpec &) = delete; + OpenCLTextInMemorySpec &operator=(const OpenCLTextInMemorySpec &) = delete; +}; + +/// An object to store several different KernelLoaderSpecs for the same kernel. 
+/// +/// This allows code in different formats and for different platforms to be +/// stored all together for a single kernel. +/// +/// Various methods are available to add a new KernelLoaderSpec to a +/// MultiKernelLoaderSpec. There are also methods to query which formats and +/// platforms are supported by the currently added KernelLoaderSpec objects, and +/// methods to get the KernelLoaderSpec objects for each format and platform. +/// +/// Since all stored KernelLoaderSpecs are supposed to reference the same +/// kernel, they are all assumed to take the same number and type of parameters, +/// but no checking is done to enforce this. In debug mode, all +/// KernelLoaderSpecs are checked to make sure they have the same kernel name, +/// so passing in specs with different kernel names can cause the program to +/// abort. +/// +/// This interface is prone to errors, so it is better to leave +/// MultiKernelLoaderSpec creation and initialization to the compiler rather +/// than doing it by hand. +class MultiKernelLoaderSpec { +public: + // Convenience getters for testing whether these platform variants have + // kernel loader specifications available. + + bool hasCUDAPTXInMemory() const { return TheCUDAPTXInMemorySpec != nullptr; } + bool hasCUDAFatbinInMemory() const { + return TheCUDAFatbinInMemorySpec != nullptr; + } + bool hasOpenCLTextInMemory() const { + return TheOpenCLTextInMemorySpec != nullptr; + } + + // Accessors for platform variant kernel load specifications. + // + // Precondition: corresponding has* method returns true. 
+ + const CUDAPTXInMemorySpec &getCUDAPTXInMemory() const { + assert(hasCUDAPTXInMemory() && "getting spec that is not present"); + return *TheCUDAPTXInMemorySpec; + } + const CUDAFatbinInMemorySpec &getCUDAFatbinInMemory() const { + assert(hasCUDAFatbinInMemory() && "getting spec that is not present"); + return *TheCUDAFatbinInMemorySpec; + } + const OpenCLTextInMemorySpec &getOpenCLTextInMemory() const { + assert(hasOpenCLTextInMemory() && "getting spec that is not present"); + return *TheOpenCLTextInMemorySpec; + } + + // Builder-pattern-like methods for use in initializing a + // MultiKernelLoaderSpec. + // + // Each of these should be used at most once for a single + // MultiKernelLoaderSpec object. See file comment for example usage. + // + // Note that the KernelName parameter must be consistent with the kernel in + // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel + // name may be mangled by the compiler if it is not declared extern "C". + + /// Does not take ownership of the PTXCode pointers in the SpecList elements. + MultiKernelLoaderSpec & + addCUDAPTXInMemory(llvm::StringRef KernelName, + llvm::ArrayRef SpecList); + + /// Does not take ownership of the FatbinBytes pointer. + MultiKernelLoaderSpec &addCUDAFatbinInMemory(llvm::StringRef KernelName, + const void *FatbinBytes); + + /// Does not take ownership of the OpenCLText pointer. 
+ MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName, + const char *OpenCLText); + +private: + void setKernelName(llvm::StringRef KernelName); + + std::unique_ptr TheKernelName; + std::unique_ptr TheCUDAPTXInMemorySpec; + std::unique_ptr TheCUDAFatbinInMemorySpec; + std::unique_ptr TheOpenCLTextInMemorySpec; +}; + +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_KERNELSPEC_H Index: streamexecutor/lib/CMakeLists.txt =================================================================== --- streamexecutor/lib/CMakeLists.txt +++ streamexecutor/lib/CMakeLists.txt @@ -2,3 +2,12 @@ utils OBJECT Utils/Error.cpp) + +add_library( + streamexecutor + $ + KernelSpec.cpp) + +if(STREAM_EXECUTOR_UNIT_TESTS) + add_subdirectory(unittests) +endif() Index: streamexecutor/lib/KernelSpec.cpp =================================================================== --- /dev/null +++ streamexecutor/lib/KernelSpec.cpp @@ -0,0 +1,94 @@ +//===-- KernelSpec.cpp - General kernel spec implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the implementation details for kernel loader specs. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/KernelSpec.h" + +#include "llvm/ADT/STLExtras.h" + +namespace streamexecutor { + +KernelLoaderSpec::KernelLoaderSpec(llvm::StringRef KernelName) + : KernelName(KernelName) {} + +CUDAPTXInMemorySpec::CUDAPTXInMemorySpec( + llvm::StringRef KernelName, + const llvm::ArrayRef SpecList) + : KernelLoaderSpec(KernelName) { + for (const auto &Spec : SpecList) { + PTXByComputeCapability.emplace(Spec.TheComputeCapability, Spec.PTXCode); + } +} + +const char *CUDAPTXInMemorySpec::getCode(int ComputeCapabilityMajor, + int ComputeCapabilityMinor) const { + auto PTXIter = + PTXByComputeCapability.find(CUDAPTXInMemorySpec::ComputeCapability{ + ComputeCapabilityMajor, ComputeCapabilityMinor}); + if (PTXIter == PTXByComputeCapability.end()) { + return nullptr; + } + return PTXIter->second; +} + +CUDAFatbinInMemorySpec::CUDAFatbinInMemorySpec(llvm::StringRef KernelName, + const void *Bytes) + : KernelLoaderSpec(KernelName), Bytes(Bytes) {} + +OpenCLTextInMemorySpec::OpenCLTextInMemorySpec(llvm::StringRef KernelName, + const char *Text) + : KernelLoaderSpec(KernelName), Text(Text) {} + +void MultiKernelLoaderSpec::setKernelName(llvm::StringRef KernelName) { + if (TheKernelName) { + assert(KernelName.equals(*TheKernelName) && + "different kernel names in one MultiKernelLoaderSpec"); + } else { + TheKernelName = llvm::make_unique(KernelName); + } +} + +MultiKernelLoaderSpec &MultiKernelLoaderSpec::addCUDAPTXInMemory( + llvm::StringRef KernelName, + llvm::ArrayRef SpecList) { + assert((TheCUDAPTXInMemorySpec == nullptr) && + "illegal loader spec overwrite"); + setKernelName(KernelName); + TheCUDAPTXInMemorySpec = + llvm::make_unique(KernelName, SpecList); + return *this; +} + +MultiKernelLoaderSpec & +MultiKernelLoaderSpec::addCUDAFatbinInMemory(llvm::StringRef KernelName, + const void *Bytes) { + assert((TheCUDAFatbinInMemorySpec == nullptr) && + "illegal loader spec 
overwrite"); + setKernelName(KernelName); + TheCUDAFatbinInMemorySpec = + llvm::make_unique(KernelName, Bytes); + return *this; +} + +MultiKernelLoaderSpec & +MultiKernelLoaderSpec::addOpenCLTextInMemory(llvm::StringRef KernelName, + const char *OpenCLText) { + assert((TheOpenCLTextInMemorySpec == nullptr) && + "illegal loader spec overwrite"); + setKernelName(KernelName); + TheOpenCLTextInMemorySpec = + llvm::make_unique(KernelName, OpenCLText); + return *this; +} + +} // namespace streamexecutor Index: streamexecutor/lib/unittests/CMakeLists.txt =================================================================== --- /dev/null +++ streamexecutor/lib/unittests/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable( + kernel_spec_test + KernelSpecTest.cpp) +target_link_libraries( + kernel_spec_test + streamexecutor + ${GTEST_BOTH_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT}) +add_test(KernelSpecTest kernel_spec_test) Index: streamexecutor/lib/unittests/KernelSpecTest.cpp =================================================================== --- /dev/null +++ streamexecutor/lib/unittests/KernelSpecTest.cpp @@ -0,0 +1,132 @@ +//===-- KernelSpecTest.cpp - Tests for KernelSpec -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the unit tests for the code in KernelSpec. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/KernelSpec.h" + +#include "gtest/gtest.h" + +namespace { + +namespace se = ::streamexecutor; + +TEST(CUDAPTXInMemorySpec, NoCode) { + se::CUDAPTXInMemorySpec Spec("KernelName", {}); + EXPECT_EQ("KernelName", Spec.getKernelName()); + EXPECT_EQ(nullptr, Spec.getCode(1, 0)); +} + +TEST(CUDAPTXInMemorySpec, SingleComputeCapability) { + const char *PTXCodeString = "Dummy PTX code"; + se::CUDAPTXInMemorySpec Spec("KernelName", {{{1, 0}, PTXCodeString}}); + EXPECT_EQ("KernelName", Spec.getKernelName()); + EXPECT_EQ(PTXCodeString, Spec.getCode(1, 0)); + EXPECT_EQ(nullptr, Spec.getCode(2, 0)); +} + +TEST(CUDAPTXInMemorySpec, TwoComputeCapabilities) { + const char *PTXCodeString10 = "Dummy PTX code 10"; + const char *PTXCodeString30 = "Dummy PTX code 30"; + se::CUDAPTXInMemorySpec Spec( + "KernelName", {{{1, 0}, PTXCodeString10}, {{3, 0}, PTXCodeString30}}); + EXPECT_EQ("KernelName", Spec.getKernelName()); + EXPECT_EQ(PTXCodeString10, Spec.getCode(1, 0)); + EXPECT_EQ(PTXCodeString30, Spec.getCode(3, 0)); + EXPECT_EQ(nullptr, Spec.getCode(2, 0)); +} + +TEST(CUDAFatbinInMemorySpec, BasicUsage) { + const char *FatbinBytes = "Dummy fatbin bytes"; + se::CUDAFatbinInMemorySpec Spec("KernelName", FatbinBytes); + EXPECT_EQ("KernelName", Spec.getKernelName()); + EXPECT_EQ(FatbinBytes, Spec.getBytes()); +} + +TEST(OpenCLTextInMemorySpec, BasicUsage) { + const char *OpenCLText = "Dummy OpenCL text"; + se::OpenCLTextInMemorySpec Spec("KernelName", OpenCLText); + EXPECT_EQ("KernelName", Spec.getKernelName()); + EXPECT_EQ(OpenCLText, Spec.getText()); +} + +TEST(MultiKernelLoaderSpec, NoCode) { + se::MultiKernelLoaderSpec MultiSpec; + EXPECT_FALSE(MultiSpec.hasCUDAPTXInMemory()); + EXPECT_FALSE(MultiSpec.hasCUDAFatbinInMemory()); + EXPECT_FALSE(MultiSpec.hasOpenCLTextInMemory()); + + EXPECT_DEBUG_DEATH(MultiSpec.getCUDAPTXInMemory(), + "getting spec that is not present"); + 
EXPECT_DEBUG_DEATH(MultiSpec.getCUDAFatbinInMemory(), + "getting spec that is not present"); + EXPECT_DEBUG_DEATH(MultiSpec.getOpenCLTextInMemory(), + "getting spec that is not present"); +} + +TEST(MultiKernelLoaderSpec, Registration) { + se::MultiKernelLoaderSpec MultiSpec; + const char *KernelName = "KernelName"; + const char *PTXCodeString = "Dummy PTX code"; + const char *FatbinBytes = "Dummy fatbin bytes"; + const char *OpenCLText = "Dummy OpenCL text"; + + MultiSpec.addCUDAPTXInMemory(KernelName, {{{1, 0}, PTXCodeString}}) + .addCUDAFatbinInMemory(KernelName, FatbinBytes) + .addOpenCLTextInMemory(KernelName, OpenCLText); + + EXPECT_TRUE(MultiSpec.hasCUDAPTXInMemory()); + EXPECT_TRUE(MultiSpec.hasCUDAFatbinInMemory()); + EXPECT_TRUE(MultiSpec.hasOpenCLTextInMemory()); + + EXPECT_EQ(KernelName, MultiSpec.getCUDAPTXInMemory().getKernelName()); + EXPECT_EQ(PTXCodeString, MultiSpec.getCUDAPTXInMemory().getCode(1, 0)); + EXPECT_EQ(nullptr, MultiSpec.getCUDAPTXInMemory().getCode(2, 0)); + + EXPECT_EQ(KernelName, MultiSpec.getCUDAFatbinInMemory().getKernelName()); + EXPECT_EQ(FatbinBytes, MultiSpec.getCUDAFatbinInMemory().getBytes()); + + EXPECT_EQ(KernelName, MultiSpec.getOpenCLTextInMemory().getKernelName()); + EXPECT_EQ(OpenCLText, MultiSpec.getOpenCLTextInMemory().getText()); +} + +TEST(MultiKernelLoaderSpec, RegisterTwice) { + se::MultiKernelLoaderSpec MultiSpec; + const char *KernelName = "KernelName"; + const char *FatbinBytes = "Dummy fatbin bytes"; + + MultiSpec.addCUDAFatbinInMemory(KernelName, FatbinBytes); + + EXPECT_DEBUG_DEATH(MultiSpec.addCUDAFatbinInMemory(KernelName, FatbinBytes), + "illegal loader spec overwrite"); +} + +TEST(MultiKernelLoaderSpec, ConflictingKernelNames) { + se::MultiKernelLoaderSpec MultiSpec; + const char *KernelNameA = "KernelName"; + std::string KernelNameB = KernelNameA; + const char *PTXCodeString = "Dummy PTX code"; + const char *FatbinBytes = "Dummy fatbin bytes"; + + // Check that names don't conflict if they are 
equivalent strings in different + // locations. + MultiSpec.addCUDAPTXInMemory(KernelNameA, {{{1, 0}, PTXCodeString}}) + .addCUDAFatbinInMemory(KernelNameB, FatbinBytes); + + const char *OtherKernelName = "OtherKernelName"; + const char *OpenCLText = "Dummy OpenCL text"; + EXPECT_DEBUG_DEATH( + MultiSpec.addOpenCLTextInMemory(OtherKernelName, OpenCLText), + "different kernel names in one MultiKernelLoaderSpec"); +} + +} // namespace