Index: streamexecutor/examples/CMakeLists.txt
===================================================================
--- streamexecutor/examples/CMakeLists.txt
+++ streamexecutor/examples/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable(cuda_saxpy_example CUDASaxpy.cpp)
 target_link_libraries(cuda_saxpy_example streamexecutor)
+
+add_executable(host_saxpy_example HostSaxpy.cpp)
+target_link_libraries(host_saxpy_example streamexecutor)
Index: streamexecutor/examples/CUDASaxpy.cpp
===================================================================
--- streamexecutor/examples/CUDASaxpy.cpp
+++ streamexecutor/examples/CUDASaxpy.cpp
@@ -17,7 +17,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cstdio>
 #include <cstdlib>
 #include <vector>
 
Index: streamexecutor/examples/HostSaxpy.cpp
===================================================================
--- streamexecutor/examples/HostSaxpy.cpp
+++ streamexecutor/examples/HostSaxpy.cpp
@@ -1,4 +1,4 @@
-//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
+//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,91 +9,45 @@
 ///
 /// \file
 /// This file contains example code demonstrating the usage of the
-/// StreamExecutor API. Snippets of this file will be included as code examples
-/// in documentation. Taking these examples from a real source file guarantees
-/// that the examples will always compile.
+/// StreamExecutor API for a host platform.
 ///
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 
 #include "streamexecutor/StreamExecutor.h"
 
-/// [Example saxpy compiler-generated]
-// Code in this namespace is generated by the compiler (e.g. clang).
-//
-// The name of this namespace may depend on the compiler that generated it, so
-// this is just an example name.
-namespace __compilergen {
+void Saxpy(float A, float *X, float *Y, size_t N) {
+  for (float *XEnd = X + N; X != XEnd; ++X, ++Y)
+    *X = A * *X + *Y; // In-place update: X = A * X + Y.
+}
 
-// Specialization of the streamexecutor::Kernel template class for the parameter
-// types of the saxpy(float A, float *X, float *Y) kernel.
+namespace __compilergen {
 using SaxpyKernel =
     streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
-                           streamexecutor::GlobalDeviceMemory<float>>;
-
-// A string containing the PTX code generated by the device compiler for the
-// saxpy kernel. String contents not shown here.
-extern const char *SaxpyPTX;
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
+
+// Wrapper unpacks the packed argument addresses and forwards them to Saxpy.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  auto A = *static_cast<const float *>(ArgumentAddresses[0]);
+  auto *X = static_cast<float *>(const_cast<void *>(ArgumentAddresses[1]));
+  auto *Y = static_cast<float *>(const_cast<void *>(ArgumentAddresses[2]));
+  Saxpy(A, X, Y, *static_cast<const size_t *>(ArgumentAddresses[3]));
+}
 
-// A global instance of a loader spec that knows how to load the code in the
-// SaxpyPTX string.
+// The wrapper function is what gets registered.
 static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
   streamexecutor::MultiKernelLoaderSpec Spec;
-  Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
+  Spec.addHostFunction("Saxpy", SaxpyWrapper);
   return Spec;
 }();
-
 } // namespace __compilergen
-/// [Example saxpy compiler-generated]
-
-/// [Example saxpy host PTX]
-// The PTX text for a saxpy kernel.
-const char *__compilergen::SaxpyPTX = R"(
-  .version 4.3
-  .target sm_20
-  .address_size 64
-
-  .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
-    .reg .f32 %AValue;
-    .reg .f32 %XValue;
-    .reg .f32 %YValue;
-    .reg .f32 %Result;
-
-    .reg .b64 %XBaseAddrGeneric;
-    .reg .b64 %YBaseAddrGeneric;
-    .reg .b64 %XBaseAddrGlobal;
-    .reg .b64 %YBaseAddrGlobal;
-    .reg .b64 %XAddr;
-    .reg .b64 %YAddr;
-    .reg .b64 %ThreadByteOffset;
-
-    .reg .b32 %TID;
-
-    ld.param.f32 %AValue, [A];
-    ld.param.u64 %XBaseAddrGeneric, [X];
-    ld.param.u64 %YBaseAddrGeneric, [Y];
-    cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
-    cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
-    mov.u32 %TID, %tid.x;
-    mul.wide.u32 %ThreadByteOffset, %TID, 4;
-    add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
-    add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
-    ld.global.f32 %XValue, [%XAddr];
-    ld.global.f32 %YValue, [%YAddr];
-    fma.rn.f32 %Result, %AValue, %XValue, %YValue;
-    st.global.f32 [%XAddr], %Result;
-    ret;
-  }
-)";
-/// [Example saxpy host PTX]
 
 int main() {
-  /// [Example saxpy host main]
   namespace se = ::streamexecutor;
   namespace cg = ::__compilergen;
 
@@ -105,7 +58,7 @@
 
   // Get a device object.
   se::Platform *Platform =
-      getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
   if (Platform->getDeviceCount() == 0) {
     return EXIT_FAILURE;
   }
@@ -130,7 +83,7 @@
   se::Stream Stream = getOrDie(Device->createStream());
   Stream.thenCopyH2D(RegisteredX, X)
       .thenCopyH2D(RegisteredY, Y)
-      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
+      .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
       .thenCopyD2H(X, RegisteredX);
   // Wait for the stream to complete.
   se::dieIfError(Stream.blockHostUntilDone());
@@ -138,5 +91,4 @@
   // Process output data in HostX.
   std::vector<float> ExpectedX = {4, 47, 90, 133};
   assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
-  /// [Example saxpy host main]
 }
Index: streamexecutor/include/streamexecutor/KernelSpec.h
===================================================================
--- streamexecutor/include/streamexecutor/KernelSpec.h
+++ streamexecutor/include/streamexecutor/KernelSpec.h
@@ -65,11 +65,13 @@
 #define STREAMEXECUTOR_KERNELSPEC_H
 
 #include <cassert>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace streamexecutor {
@@ -199,6 +201,9 @@
 /// than doing it by hand.
 class MultiKernelLoaderSpec {
 public:
+  /// Type of host kernel functions; they receive the argument address array.
+  using HostFunctionTy = std::function<void(const void *const *)>;
+
   std::string getKernelName() const {
     if (TheKernelName) {
       return *TheKernelName;
@@ -216,6 +221,7 @@
   bool hasOpenCLTextInMemory() const {
     return TheOpenCLTextInMemorySpec != nullptr;
   }
+  bool hasHostFunction() const { return static_cast<bool>(HostFunction); }
 
   // Accessors for platform variant kernel load specifications.
   //
@@ -234,6 +240,11 @@
     return *TheOpenCLTextInMemorySpec;
   }
 
+  const HostFunctionTy &getHostFunction() const {
+    assert(HostFunction != nullptr && "getting spec that is not present");
+    return *HostFunction.get();
+  }
+
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec.
   //
@@ -257,6 +268,20 @@
   MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                                const char *OpenCLText);
 
+  /// Registers \p Function as the host platform implementation of the kernel.
+  ///
+  /// \param KernelName name of the kernel, forwarded to setKernelName().
+  /// \param Function host function to store. It is moved into this spec and
+  /// replaces any host function registered earlier.
+  /// \returns a reference to this spec, so that calls to the add* methods can
+  /// be chained.
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy &&Function) {
+    setKernelName(KernelName);
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
 private:
   void setKernelName(llvm::StringRef KernelName);
 
@@ -264,6 +281,7 @@
   std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
   std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
   std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+  std::unique_ptr<HostFunctionTy> HostFunction;
 };
 
 } // namespace streamexecutor
Index: streamexecutor/include/streamexecutor/PlatformDevice.h
===================================================================
--- streamexecutor/include/streamexecutor/PlatformDevice.h
+++ streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -149,10 +149,10 @@
   /// Similar to synchronousCopyD2H(const void *, size_t, void
   /// *, size_t, size_t), but copies memory from one location in device memory
   /// to another rather than from device to host.
-  virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
-                                   size_t DstByteOffset,
-                                   const void *DeviceSrcHandle,
-                                   size_t SrcByteOffset, size_t ByteCount) {
+  virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+                                   size_t SrcByteOffset,
+                                   const void *DeviceDstHandle,
+                                   size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
                       getName());
   }
Index: streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
@@ -0,0 +1,74 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include <memory>
+
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+#include "streamexecutor/platforms/host/HostPlatformDevice.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform that performs work on the host rather than offloading to an
+/// accelerator.
+///
+/// Exposes exactly one device (index 0), created on the first successful call
+/// to getDevice() and owned by the platform.
+class HostPlatform : public Platform {
+public:
+  /// Returns the number of devices on this platform, which is always 1.
+  size_t getDeviceCount() const override { return 1; }
+
+  /// Returns the single host device, creating it on first use.
+  ///
+  /// \param DeviceIndex index of the requested device; only 0 is valid.
+  /// \returns a pointer to the device owned by this platform, or an error if
+  /// \p DeviceIndex is not 0.
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex != 0) {
+      return make_error(
+          "Requested device index " + llvm::Twine(DeviceIndex) +
+          " from host platform which only supports device index 0");
+    }
+    // The lock covers both the check and the construction so that concurrent
+    // callers cannot each create a device.
+    llvm::sys::ScopedLock Lock(Mutex);
+    if (!TheDevice) {
+      ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+      TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+    }
+    return TheDevice.get();
+  }
+
+private:
+  // Guards the lazy creation of ThePlatformDevice and TheDevice.
+  llvm::sys::Mutex Mutex;
+  // Declared before TheDevice so that it is destroyed after TheDevice, which
+  // is constructed from a pointer to it.
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
Index: streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -0,0 +1,205 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <string>
+
+#include "streamexecutor/PlatformDevice.h"
+
+#include "llvm/ADT/Twine.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+///
+/// Device memory is ordinary heap memory from std::malloc, so device handles
+/// are plain host pointers and every copy is a host-side byte copy performed
+/// immediately on the calling thread.
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  /// Returns a handle to the host function registered in \p Spec, or an error
+  /// if \p Spec has no host function.
+  ///
+  /// The handle is the address of the function object held by \p Spec, not a
+  /// copy, so it is only usable while that object is alive.
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  /// Does nothing: createKernel() allocates nothing that needs releasing.
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  /// Returns this device as the stream handle; no per-stream state is created.
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  /// Does nothing: createStream() allocates nothing that needs releasing.
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  /// Calls the host function behind \p PKernelHandle on the calling thread,
+  /// passing it the argument addresses from \p ArgumentArray.
+  ///
+  /// Returns an error without calling the function unless both \p BlockSize
+  /// and \p GridSize are (1,1,1).
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    // PKernelHandle is expected to come from createKernel(), so the cast
+    // target has to be exactly the type of the object stored in the spec.
+    // NOTE(review): confirm it matches MultiKernelLoaderSpec::HostFunctionTy.
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from device memory to host memory. The copy is
+  /// performed before returning; \p PlatformStreamHandle is not used.
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    return copyBytes(DeviceSrcHandle, SrcByteOffset, HostDst, DstByteOffset,
+                     ByteCount);
+  }
+
+  /// Copies \p ByteCount bytes from host memory to device memory. The copy is
+  /// performed before returning; \p PlatformStreamHandle is not used.
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    return copyBytes(HostSrc, SrcByteOffset, DeviceDstHandle, DstByteOffset,
+                     ByteCount);
+  }
+
+  /// Copies \p ByteCount bytes between two device memory locations. The copy
+  /// is performed before returning; \p PlatformStreamHandle is not used.
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    return copyBytes(DeviceSrcHandle, SrcByteOffset, DeviceDstHandle,
+                     DstByteOffset, ByteCount);
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  /// Allocates \p ByteCount bytes with std::malloc.
+  ///
+  /// Returns an error if a non-empty allocation fails. For a zero-byte request
+  /// the result of std::malloc is returned as is, since a null pointer does
+  /// not indicate failure in that case.
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    void *Memory = std::malloc(ByteCount);
+    if (!Memory && ByteCount != 0) {
+      return make_error("failed to allocate " + llvm::Twine(ByteCount) +
+                        " bytes of host memory");
+    }
+    return Memory;
+  }
+
+  /// Frees \p Handle with std::free, pairing with allocateDeviceMemory().
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  /// Does nothing and reports success.
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  /// Does nothing and reports success.
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  /// Copies \p ByteCount bytes from device memory to host memory.
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    return copyBytes(DeviceSrcHandle, SrcByteOffset, HostDst, DstByteOffset,
+                     ByteCount);
+  }
+
+  /// Copies \p ByteCount bytes from host memory to device memory.
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    return copyBytes(HostSrc, SrcByteOffset, DeviceDstHandle, DstByteOffset,
+                     ByteCount);
+  }
+
+  /// Copies \p ByteCount bytes between two device memory locations.
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    return copyBytes(DeviceSrcHandle, SrcByteOffset, DeviceDstHandle,
+                     DstByteOffset, ByteCount);
+  }
+
+private:
+  /// Returns \p Base advanced by \p Offset bytes, with constness cast away.
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+
+  /// Copies \p ByteCount bytes from \p Src + \p SrcByteOffset to \p Dst +
+  /// \p DstByteOffset.
+  static Error copyBytes(const void *Src, size_t SrcByteOffset,
+                         const void *Dst, size_t DstByteOffset,
+                         size_t ByteCount) {
+    // std::memmove rather than std::memcpy: both sides are ordinary host
+    // memory on this platform, so the two ranges may overlap (for example a
+    // device-to-device copy within one allocation).
+    std::memmove(offset(Dst, DstByteOffset), offset(Src, SrcByteOffset),
+                 ByteCount);
+    return Error::success();
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
Index: streamexecutor/lib/PlatformManager.cpp
===================================================================
--- streamexecutor/lib/PlatformManager.cpp
+++ streamexecutor/lib/PlatformManager.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
 
 namespace streamexecutor {
 
@@ -23,6 +24,8 @@
   //    appropriate code to include here.
   //  * Use static initialization tricks to have platform libraries register
   //    themselves when they are loaded.
+
+  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {