Index: streamexecutor/include/streamexecutor/KernelSpec.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/KernelSpec.h
@@ -0,0 +1,298 @@
+//===-- KernelSpec.h - Kernel loader spec types -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// A KernelLoaderSpec is a class that knows where to find the code for a
+/// data-parallel kernel in a single format on a single platform. So, for
+/// example, there will be one subclass that deals with CUDA PTX code, another
+/// subclass that deals with CUDA cubin code, and yet another subclass that
+/// deals with OpenCL text code.
+///
+/// A MultiKernelLoaderSpec is basically a collection of KernelLoaderSpec
+/// instances. This is useful when code is available for the same kernel in
+/// several different formats or targeted for several different platforms. All
+/// the various KernelLoaderSpec instances for this kernel can be combined
+/// together in one MultiKernelLoaderSpec, and the specific platform consumer
+/// can decide which instance of the code it wants to use.
+///
+/// Rather than instantiating each KernelLoaderSpec separately and then
+/// registering it with the MultiKernelLoaderSpec, the MultiKernelLoaderSpec
+/// provides several helper functions, such as
+/// MultiKernelLoaderSpec::addCUDAPTXInMemory, to construct the
+/// KernelLoaderSpec and add it to the MultiKernelLoaderSpec at the same time.
+///
+/// These loader spec classes are designed primarily to be instantiated by the
+/// compiler, but they can also be instantiated directly by the user. A
+/// simplified example workflow which a compiler might follow in the case of a
+/// CUDA kernel that is compiled to CUDA cubin code is as follows:
+///
+/// 1. The user defines a kernel function called UserKernel.
+/// 2. The compiler compiles the kernel code into cubin data and embeds that
+///    data into the host code at address __UserKernelCubinAddress.
+/// 3. The compiler adds code at the beginning of the host code to instantiate
+///    a MultiKernelLoaderSpec:
+///    \code
+///    namespace compiler_cuda_namespace {
+///    MultiKernelLoaderSpec UserKernelLoaderSpec;
+///    } // namespace compiler_cuda_namespace
+///    \endcode
+/// 4. The compiler then adds code to the host code to add the cubin data to
+///    the new MultiKernelLoaderSpec, and to associate that data with the
+///    kernel name "UserKernel":
+///    \code
+///    namespace compiler_cuda_namespace {
+///    UserKernelLoaderSpec.addCUDACubinInMemory(
+///        __UserKernelCubinAddress, "UserKernel");
+///    } // namespace compiler_cuda_namespace
+///    \endcode
+/// 5. The host code, knowing beforehand that the compiler would initialize a
+///    MultiKernelLoaderSpec based on the name of the CUDA kernel, makes use of
+///    the symbol compiler_cuda_namespace::UserKernelLoaderSpec without
+///    defining it.
+///
+/// In the example above, the MultiKernelLoaderSpec instance created by the
+/// compiler can be used by the host code to create StreamExecutor kernel
+/// objects. In turn, those StreamExecutor kernel objects can be used by the
+/// host code to launch the kernel on the device as desired.
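+///
+/// For illustration only, here is a minimal sketch (with hypothetical variable
+/// names) of how host code might query the compiler-created spec before
+/// handing the code off to a platform-specific loader:
+/// \code
+/// namespace ccn = compiler_cuda_namespace;
+/// if (ccn::UserKernelLoaderSpec.hasCUDACubinInMemory()) {
+///   const char *Cubin =
+///       ccn::UserKernelLoaderSpec.getCUDACubinInMemory().getBytes();
+///   // ... pass Cubin to the platform-specific loading routine ...
+/// }
+/// \endcode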
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_KERNELSPEC_H
+#define STREAMEXECUTOR_KERNELSPEC_H
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <tuple>
+
+#include "llvm/Support/RWMutex.h"
+
+namespace streamexecutor {
+
+/// An object that knows how to find the code for a device kernel.
+///
+/// This is the base class for the hierarchy of loader specs. The different
+/// subclasses know how to find code in different formats (e.g. CUDA PTX,
+/// OpenCL binary).
+///
+/// This base class has functionality for storing and getting the name of the
+/// kernel as a string.
+class KernelLoaderSpec {
+public:
+  /// Returns the name of the kernel this spec loads.
+  const std::string &getKernelName() const { return KernelName; }
+
+protected:
+  /// Allows subclasses to set the kernel name during construction.
+  explicit KernelLoaderSpec(const std::string &KernelName);
+
+private:
+  std::string KernelName;
+
+  KernelLoaderSpec(const KernelLoaderSpec &) = delete;
+  void operator=(const KernelLoaderSpec &) = delete;
+};
+
+/// A KernelLoaderSpec for CUDA PTX code that resides in memory.
+class CUDAPTXInMemory : public KernelLoaderSpec {
+public:
+  /// First component is major version, second component is minor version.
+  ///
+  /// A tuple is used because this will be the key of a map.
+  using ComputeCapability = std::tuple<int, int>;
+
+  /// PTX code combined with its compute capability.
+  struct PTXSpec {
+    ComputeCapability TheComputeCapability;
+    const char *PTXCode;
+  };
+
+  /// Single-PTX constructor.
+  ///
+  /// Adds the provided PTX version with the minimum compute capability.
+  ///
+  /// Since the compute capability is unknown, the PTX is assumed to be very
+  /// generally usable - in other words, PTX specified in this manner is VERY
+  /// likely to be used as the default.
+  ///
+  /// Holds a reference to the passed in PTXCode memory. Does not make a copy
+  /// and does not take ownership.
+  CUDAPTXInMemory(const char *PTXCode, const std::string &KernelName);
+
+  /// Multiple-PTX-version constructor.
+  ///
+  /// Adds each item in SpecList to this object.
+  ///
+  /// Holds a reference to each passed in PTXCode memory. Does not make a copy
+  /// and does not take ownership.
+  CUDAPTXInMemory(const std::initializer_list<PTXSpec> &SpecList,
+                  const std::string &KernelName);
+
+  /// Returns a pointer to the PTX code with the lowest-valued compute
+  /// capability.
+  ///
+  /// For example, if PTX codes of compute capabilities 2.0, 3.0, and 3.5 are
+  /// all available, the version for compute capability 2.0 will be returned.
+  /// Returns nullptr on failed lookup (if no version is available).
+  const char *getDefaultCode() const;
+
+  /// Returns a pointer to the PTX code for the requested compute capability.
+  ///
+  /// Returns nullptr on failed lookup (if the requested version is not
+  /// available).
+  const char *getCode(int ComputeCapabilityMajor,
+                      int ComputeCapabilityMinor) const;
+
+private:
+  /// PTX code contents in memory.
+  ///
+  /// The key is a (major, minor) compute capability tuple, e.g. (2,0), (3,0),
+  /// (3,5). Because compute capabilities represented in this way have a clear
+  /// sorting order, map::begin() will give the lowest-numbered version
+  /// available, i.e. the default.
+  std::map<ComputeCapability, const char *> PTXByComputeCapability;
+
+  /// Defines the minimum compute capability possible.
+  ///
+  /// Used when PTX has no compute capability specified (in the single-PTX
+  /// constructor).
+  static const ComputeCapability MinimumComputeCapability;
+
+  CUDAPTXInMemory(const CUDAPTXInMemory &) = delete;
+  void operator=(const CUDAPTXInMemory &) = delete;
+};
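+
+// For illustration only (not part of this header's API): a hypothetical
+// consumer holding a CUDAPTXInMemory instance named PTXLoaderSpec might select
+// PTX for a specific compute capability and fall back to the default when no
+// exact match was registered, e.g.:
+//
+//   const char *PTX = PTXLoaderSpec.getCode(3, 5);
+//   if (!PTX)
+//     PTX = PTXLoaderSpec.getDefaultCode();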
+
+/// A KernelLoaderSpec for CUDA cubin code that resides in memory.
+class CUDACubinInMemory : public KernelLoaderSpec {
+public:
+  /// Creates a CUDACubinInMemory with a reference to the given cubin bytes.
+  ///
+  /// Holds a reference to the passed in Bytes memory. Does not make a copy
+  /// and does not take ownership.
+  CUDACubinInMemory(const char *Bytes, const std::string &KernelName);
+
+  /// Gets the cubin data bytes.
+  const char *getBytes() const { return Bytes; }
+
+private:
+  const char *Bytes;
+
+  CUDACubinInMemory(const CUDACubinInMemory &) = delete;
+  void operator=(const CUDACubinInMemory &) = delete;
+};
+
+/// A KernelLoaderSpec for OpenCL text that resides in memory.
+class OpenCLTextInMemory : public KernelLoaderSpec {
+public:
+  /// Creates an OpenCLTextInMemory with a reference to the given OpenCL text
+  /// code bytes.
+  ///
+  /// Holds a reference to the passed in Text memory. Does not make a copy and
+  /// does not take ownership.
+  OpenCLTextInMemory(const char *Text, const std::string &KernelName);
+
+  /// Returns the OpenCL text contents.
+  const std::string &getText() const { return Text; }
+
+private:
+  /// OpenCL translation unit text contents in memory.
+  std::string Text;
+
+  OpenCLTextInMemory(const OpenCLTextInMemory &) = delete;
+  void operator=(const OpenCLTextInMemory &) = delete;
+};
+
+/// An object to store several different KernelLoaderSpecs for the same kernel.
+///
+/// This allows code in different formats and for different platforms to be
+/// stored all together for a single kernel.
+///
+/// Various methods are available to add a new KernelLoaderSpec to a
+/// MultiKernelLoaderSpec. There are also methods to query which formats and
+/// platforms are supported by the currently added KernelLoaderSpec objects,
+/// and methods to get the KernelLoaderSpec objects for each format and
+/// platform.
+///
+/// Since all stored KernelLoaderSpecs are supposed to reference the same
+/// kernel, they are all assumed to take the same number and type of
+/// parameters, and to have the same name, but no checking is done to enforce
+/// this. Since this interface is prone to errors, it is better to leave
+/// MultiKernelLoaderSpec creation and initialization to the compiler rather
+/// than doing it by hand.
+class MultiKernelLoaderSpec {
+public:
+  // Convenience getters for testing whether these platform variants have
+  // kernel loader specifications available.
+
+  bool hasCUDAPTXInMemory() const { return TheCUDAPTXInMemory != nullptr; }
+  bool hasCUDACubinInMemory() const { return TheCUDACubinInMemory != nullptr; }
+  bool hasOpenCLTextInMemory() const {
+    return TheOpenCLTextInMemory != nullptr;
+  }
+
+  // Accessors for platform variant kernel load specifications.
+  //
+  // Precondition: the corresponding has* method returns true.
+
+  const CUDACubinInMemory &getCUDACubinInMemory() const {
+    assert(hasCUDACubinInMemory());
+    return *TheCUDACubinInMemory;
+  }
+  const CUDAPTXInMemory &getCUDAPTXInMemory() const {
+    assert(hasCUDAPTXInMemory());
+    return *TheCUDAPTXInMemory;
+  }
+  const OpenCLTextInMemory &getOpenCLTextInMemory() const {
+    assert(hasOpenCLTextInMemory());
+    return *TheOpenCLTextInMemory;
+  }
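+
+  // For illustration only: because each add* method below returns "this", a
+  // hypothetical initializer could register several code formats for one
+  // kernel in a single chained expression, e.g.:
+  //
+  //   Spec.addCUDAPTXInMemory(PTXString, "UserKernel")
+  //       ->addOpenCLTextInMemory(OpenCLString, "UserKernel");
+  //
+  // (Spec, PTXString, and OpenCLString are hypothetical names.)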
+
+  // Builder-pattern-like methods for use in initializing a
+  // MultiKernelLoaderSpec.
+  //
+  // Each of these should be used at most once for a single
+  // MultiKernelLoaderSpec object. See file comment for example usage.
+  //
+  // Note that the KernelName parameter must be consistent with the kernel in
+  // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
+  // name may be mangled by the compiler if it is not declared in an
+  // extern "C" scope.
+
+  /// Holds a reference to the PTXCode memory. Does not make a copy and does
+  /// not take ownership.
+  MultiKernelLoaderSpec *addCUDAPTXInMemory(const char *PTXCode,
+                                            const std::string &KernelName);
+
+  /// Holds a reference to each PTX code memory in SpecList. Does not make a
+  /// copy and does not take ownership.
+  MultiKernelLoaderSpec *
+  addCUDAPTXInMemory(std::initializer_list<CUDAPTXInMemory::PTXSpec> SpecList,
+                     const std::string &KernelName);
+
+  /// Holds a reference to the CubinBytes memory. Does not make a copy and
+  /// does not take ownership.
+  MultiKernelLoaderSpec *addCUDACubinInMemory(const char *CubinBytes,
+                                              const std::string &KernelName);
+
+  /// Holds a reference to the OpenCLText memory. Does not make a copy and
+  /// does not take ownership.
+  MultiKernelLoaderSpec *addOpenCLTextInMemory(const char *OpenCLText,
+                                               const std::string &KernelName);
+
+private:
+  /// PTX code that resides in memory.
+  std::unique_ptr<CUDAPTXInMemory> TheCUDAPTXInMemory;
+
+  /// Binary CUDA program in memory.
+  std::unique_ptr<CUDACubinInMemory> TheCUDACubinInMemory;
+
+  /// OpenCL text that resides in memory.
+  std::unique_ptr<OpenCLTextInMemory> TheOpenCLTextInMemory;
+};
+
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_KERNELSPEC_H
Index: streamexecutor/lib/CMakeLists.txt
===================================================================
--- streamexecutor/lib/CMakeLists.txt
+++ streamexecutor/lib/CMakeLists.txt
@@ -2,3 +2,8 @@
   utils
   OBJECT
   Utils/Error.cpp)
+
+add_library(
+  streamexecutor
+  $<TARGET_OBJECTS:utils>
+  KernelSpec.cpp)
Index: streamexecutor/lib/KernelSpec.cpp
===================================================================
--- /dev/null
+++ streamexecutor/lib/KernelSpec.cpp
@@ -0,0 +1,105 @@
+//===-- KernelSpec.cpp - General kernel spec implementation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation details for kernel loader specs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/KernelSpec.h"
+
+#include <cassert>
+#include <string>
+
+#include "llvm/ADT/STLExtras.h"
+
+namespace streamexecutor {
+
+KernelLoaderSpec::KernelLoaderSpec(const std::string &KernelName)
+    : KernelName(KernelName) {}
+
+// Minimum compute capability is 1.0.
+const CUDAPTXInMemory::ComputeCapability
+    CUDAPTXInMemory::MinimumComputeCapability{1, 0};
+
+CUDAPTXInMemory::CUDAPTXInMemory(const char *PTXCode,
+                                 const std::string &KernelName)
+    : KernelLoaderSpec(KernelName) {
+  PTXByComputeCapability.emplace(MinimumComputeCapability, PTXCode);
+}
+
+CUDAPTXInMemory::CUDAPTXInMemory(
+    const std::initializer_list<PTXSpec> &SpecList,
+    const std::string &KernelName)
+    : KernelLoaderSpec(KernelName) {
+  for (const auto &Spec : SpecList) {
+    PTXByComputeCapability.emplace(Spec.TheComputeCapability, Spec.PTXCode);
+  }
+}
+
+const char *CUDAPTXInMemory::getDefaultCode() const {
+  if (PTXByComputeCapability.empty()) {
+    return nullptr;
+  }
+  return PTXByComputeCapability.begin()->second;
+}
+
+const char *CUDAPTXInMemory::getCode(int ComputeCapabilityMajor,
+                                     int ComputeCapabilityMinor) const {
+  auto PTXIter =
+      PTXByComputeCapability.find(CUDAPTXInMemory::ComputeCapability{
+          ComputeCapabilityMajor, ComputeCapabilityMinor});
+  if (PTXIter == PTXByComputeCapability.end()) {
+    return nullptr;
+  }
+  return PTXIter->second;
+}
+
+CUDACubinInMemory::CUDACubinInMemory(const char *Bytes,
+                                     const std::string &KernelName)
+    : KernelLoaderSpec(KernelName), Bytes(Bytes) {}
+
+OpenCLTextInMemory::OpenCLTextInMemory(const char *Text,
+                                       const std::string &KernelName)
+    : KernelLoaderSpec(KernelName), Text(Text) {}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addCUDAPTXInMemory(const char *PTXCode,
+                                          const std::string &KernelName) {
+  assert(TheCUDAPTXInMemory == nullptr);
+  TheCUDAPTXInMemory =
+      llvm::make_unique<CUDAPTXInMemory>(PTXCode, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::addCUDAPTXInMemory(
+    std::initializer_list<CUDAPTXInMemory::PTXSpec> SpecList,
+    const std::string &KernelName) {
+  assert(TheCUDAPTXInMemory == nullptr);
+  TheCUDAPTXInMemory =
+      llvm::make_unique<CUDAPTXInMemory>(SpecList, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addCUDACubinInMemory(const char *Bytes,
+                                            const std::string &KernelName) {
+  assert(TheCUDACubinInMemory == nullptr);
+  TheCUDACubinInMemory =
+      llvm::make_unique<CUDACubinInMemory>(Bytes, KernelName);
+  return this;
+}
+
+MultiKernelLoaderSpec *
+MultiKernelLoaderSpec::addOpenCLTextInMemory(const char *OpenCLText,
+                                             const std::string &KernelName) {
+  assert(TheOpenCLTextInMemory == nullptr);
+  TheOpenCLTextInMemory =
+      llvm::make_unique<OpenCLTextInMemory>(OpenCLText, KernelName);
+  return this;
+}
+
+} // namespace streamexecutor