Index: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt
===================================================================
--- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt
+++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable(cuda_saxpy_example CUDASaxpy.cpp)
 target_link_libraries(cuda_saxpy_example streamexecutor)
+
+add_executable(host_saxpy_example HostSaxpy.cpp)
+target_link_libraries(host_saxpy_example streamexecutor)
Index: parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
+++ parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
@@ -17,7 +17,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cstdio>
 #include <cstdlib>
 #include <vector>
 
Index: parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp
+++ parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp
@@ -0,0 +1,95 @@
+//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains example code demonstrating the usage of the
+/// StreamExecutor API for a host platform.
+///
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include "streamexecutor/StreamExecutor.h"
+
+void Saxpy(float A, float *X, float *Y, size_t N) { // In place: X = A*X + Y.
+  for (float *XEnd = X + N; X != XEnd; ++X, ++Y)
+    *X = A * *X + *Y;
+}
+
+namespace __compilergen {
+using SaxpyKernel =
+    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
+
+// Unpacks the address array: reads A and N by value, forwards X and Y.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  auto *X = static_cast<float *>(const_cast<void *>(ArgumentAddresses[1]));
+  auto *Y = static_cast<float *>(const_cast<void *>(ArgumentAddresses[2]));
+  Saxpy(*static_cast<const float *>(ArgumentAddresses[0]), X, Y,
+        *static_cast<const size_t *>(ArgumentAddresses[3]));
+}
+
+// Loader spec that registers the wrapper as the host kernel named "Saxpy".
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
+  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
+  LoaderSpec.addHostFunction("Saxpy", SaxpyWrapper);
+  return LoaderSpec;
+}();
+} // namespace __compilergen
+
+int main() {
+  namespace se = ::streamexecutor;
+  namespace cg = ::__compilergen;
+
+  // Host-side inputs: the scalar A and the two operand vectors.
+  float A = 42.0f;
+  std::vector<float> HostX = {0, 1, 2, 3};
+  std::vector<float> HostY = {4, 5, 6, 7};
+  size_t ElementCount = HostX.size();
+
+  // Look up the "host" platform and take its first device.
+  se::Platform *Platform =
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
+  if (Platform->getDeviceCount() == 0)
+    return EXIT_FAILURE;
+  se::Device *Device = getOrDie(Platform->getDevice(0));
+
+  // Create the kernel on the device from its loader spec.
+  cg::SaxpyKernel Kernel =
+      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+  // Register the host vectors with the device for use in the stream copies.
+  se::RegisteredHostMemory<float> RegisteredX =
+      getOrDie(Device->registerHostMemory<float>(HostX));
+  se::RegisteredHostMemory<float> RegisteredY =
+      getOrDie(Device->registerHostMemory<float>(HostY));
+
+  // Allocate device memory for both vectors.
+  se::GlobalDeviceMemory<float> DeviceX =
+      getOrDie(Device->allocateDeviceMemory<float>(ElementCount));
+  se::GlobalDeviceMemory<float> DeviceY =
+      getOrDie(Device->allocateDeviceMemory<float>(ElementCount));
+
+  // Enqueue on one stream: copy in, launch the kernel, copy X back out.
+  se::Stream Stream = getOrDie(Device->createStream());
+  Stream.thenCopyH2D(RegisteredX, DeviceX)
+      .thenCopyH2D(RegisteredY, DeviceY)
+      .thenLaunch(1, 1, Kernel, A, DeviceX, DeviceY, ElementCount)
+      .thenCopyD2H(DeviceX, RegisteredX);
+  // Block until everything queued on the stream has finished.
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  // HostX is expected to hold A * X + Y now; compare with the known result.
+  std::vector<float> GoldenX = {4, 47, 90, 133};
+  assert(std::equal(GoldenX.begin(), GoldenX.end(), HostX.begin()));
+}
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
@@ -65,11 +65,13 @@
 #define STREAMEXECUTOR_KERNELSPEC_H
 
 #include <cassert>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace streamexecutor {
@@ -199,6 +201,9 @@
 /// than doing it by hand.
 class MultiKernelLoaderSpec {
 public:
+  /// Host platform kernel type; it is called with the argument addresses.
+  using HostFunctionTy = std::function<void(const void *const *)>;
+
   std::string getKernelName() const {
     if (TheKernelName)
       return *TheKernelName;
@@ -215,6 +220,7 @@
   bool hasOpenCLTextInMemory() const {
     return TheOpenCLTextInMemorySpec != nullptr;
   }
+  bool hasHostFunction() const { return HostFunction != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   //
@@ -233,6 +239,11 @@
     return *TheOpenCLTextInMemorySpec;
   }
 
+  const HostFunctionTy &getHostFunction() const {
+    assert(hasHostFunction() && "getting spec that is not present");
+    return *HostFunction;
+  }
+
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec.
   //
@@ -256,6 +267,16 @@
   MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                                const char *OpenCLText);
 
+  /// Registers \p Function as the host platform implementation of the kernel
+  /// named \p KernelName.
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy Function) {
+    // Forward the name as well so that it is not silently discarded.
+    setKernelName(KernelName);
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
 private:
   void setKernelName(llvm::StringRef KernelName);
 
@@ -263,6 +284,7 @@
   std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
   std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
   std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+  std::unique_ptr<HostFunctionTy> HostFunction;
 };
 
 } // namespace streamexecutor
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -149,10 +149,10 @@
   /// Similar to synchronousCopyD2H(const void *, size_t, void
   /// *, size_t, size_t), but copies memory from one location in device memory
   /// to another rather than from device to host.
-  virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
-                                   size_t DstByteOffset,
-                                   const void *DeviceSrcHandle,
-                                   size_t SrcByteOffset, size_t ByteCount) {
+  virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+                                   size_t SrcByteOffset,
+                                   const void *DeviceDstHandle,
+                                   size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
                       getName());
   }
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
@@ -0,0 +1,56 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include "HostPlatformDevice.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform that runs work on the host itself instead of offloading it to an
+/// accelerator. It exposes exactly one device, at index 0.
+class HostPlatform : public Platform {
+public:
+  size_t getDeviceCount() const override { return 1; }
+
+  /// Returns the host device, creating it under Mutex on the first request.
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex != 0)
+      return make_error(
+          "Requested device index " + llvm::Twine(DeviceIndex) +
+          " from host platform which only supports device index 0");
+    llvm::sys::ScopedLock Lock(Mutex);
+    if (!TheDevice) {
+      ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+      TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+    }
+    return TheDevice.get();
+  }
+
+private:
+  llvm::sys::Mutex Mutex;
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -0,0 +1,190 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+///
+/// Device memory is ordinary memory from std::malloc, and every operation is
+/// carried out inside the call that requests it.
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  /// Returns the address of the host function stored in \p Spec as the kernel
+  /// handle, or an error if \p Spec has no host function.
+  ///
+  /// The handle points into \p Spec, so \p Spec must outlive the kernel.
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  /// Nothing to release: createKernel() allocates nothing.
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  /// Hands out this device object itself as the stream handle.
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  /// Nothing to release: createStream() allocates nothing.
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  /// Calls the host function behind \p PKernelHandle with the argument
+  /// addresses of \p ArgumentArray; the kernel has run when this returns.
+  ///
+  /// Returns an error, without calling the kernel, unless \p BlockSize and
+  /// \p GridSize are both (1,1,1).
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    // PKernelHandle is expected to be a handle produced by createKernel().
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from device memory to host memory. The copy is
+  /// finished when this returns; \p PlatformStreamHandle is not used.
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from host memory to device memory. The copy is
+  /// finished when this returns; \p PlatformStreamHandle is not used.
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes between two device memory ranges. The copy is
+  /// finished when this returns; \p PlatformStreamHandle is not used.
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    // Both ranges are ordinary memory and could overlap, which memcpy does not
+    // permit, so copy with memmove.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  /// Allocates \p ByteCount bytes with std::malloc. Returns an error if the
+  /// allocation fails; a null result for a zero-byte request is not an error.
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    void *Memory = std::malloc(ByteCount);
+    if (!Memory && ByteCount != 0) {
+      return make_error("host platform failed to allocate " +
+                        llvm::Twine(ByteCount) + " bytes of device memory");
+    }
+    return Memory;
+  }
+
+  /// Releases \p Handle with std::free, matching allocateDeviceMemory().
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  /// Does nothing and reports success.
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  /// Does nothing and reports success.
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  /// Same byte copy as copyD2H(), without a stream handle.
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Same byte copy as copyH2D(), without a stream handle.
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  /// Same byte copy as copyD2D(), without a stream handle.
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    // The two ranges could overlap, so copy with memmove rather than memcpy.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+private:
+  /// Returns \p Base advanced by \p Offset bytes, with constness cast away.
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
Index: parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
+++ parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
 
 namespace streamexecutor {
 
@@ -23,6 +24,8 @@
   //    appropriate code to include here.
   //  * Use static initialization tricks to have platform libraries register
   //    themselves when they are loaded.
+
+  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {