Index: parallel-libs/trunk/streamexecutor/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/CMakeLists.txt @@ -62,6 +62,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter") add_subdirectory(lib) +add_subdirectory(examples) if (STREAM_EXECUTOR_ENABLE_DOXYGEN) find_package(Doxygen REQUIRED) Index: parallel-libs/trunk/streamexecutor/Doxyfile.in =================================================================== --- parallel-libs/trunk/streamexecutor/Doxyfile.in +++ parallel-libs/trunk/streamexecutor/Doxyfile.in @@ -398,7 +398,7 @@ # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class will # be included in the documentation. @@ -811,7 +811,7 @@ # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@ # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and Index: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(example Example.cpp) +target_link_libraries(example streamexecutor) Index: parallel-libs/trunk/streamexecutor/examples/Example.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/Example.cpp +++ parallel-libs/trunk/streamexecutor/examples/Example.cpp @@ -0,0 +1,163 @@ +//===-- Example.cpp - Example code for documentation ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file
/// This file contains example code demonstrating the usage of the +/// StreamExecutor API. Snippets of this file will be included as code examples +/// in documentation. Taking these examples from a real source file guarantees +/// that the examples will always compile. +/// +//===----------------------------------------------------------------------===// + +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <memory> +#include <vector> + +#include "streamexecutor/StreamExecutor.h" + +/// [Example saxpy host helper functions] +// Example handler for streamexecutor::Expected return values.
+template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) { + if (!E) { + std::fprintf(stderr, "Error extracting an expected value: %s.\n", + streamexecutor::consumeAndGetMessage(E.takeError()).c_str()); + std::exit(EXIT_FAILURE); + } + return std::move(*E); +} + +// Example handler for streamexecutor::Error return values. +void check(streamexecutor::Error &&E) { + if (E) { + std::fprintf(stderr, "Error encountered: %s.\n", + streamexecutor::consumeAndGetMessage(std::move(E)).c_str()); + std::exit(EXIT_FAILURE); + } +} +/// [Example saxpy host helper functions] + +/// [Example saxpy compiler-generated] +// Code in this namespace is generated by the compiler (e.g. clang). +// +// The name of this namespace may depend on the compiler that generated it, so +// this is just an example name. +namespace __compilergen { + +// Specialization of the streamexecutor::Kernel template class for the parameter +// types of the saxpy(float A, float *X, float *Y) kernel. +using SaxpyKernel = + streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, + streamexecutor::GlobalDeviceMemory<float>>; + +// A string containing the PTX code generated by the device compiler for the +// saxpy kernel. String contents not shown here. +extern const char *SaxpyPTX; + +// A global instance of a loader spec that knows how to load the code in the +// SaxpyPTX string.
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + return Spec; +}(); + +} // namespace __compilergen +/// [Example saxpy compiler-generated] + +/// [Example saxpy host PTX] +const char *__compilergen::SaxpyPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { + .reg .f32 %AValue; + .reg .f32 %XValue; + .reg .f32 %YValue; + .reg .f32 %Result; + + .reg .b64 %XBaseAddrGeneric; + .reg .b64 %YBaseAddrGeneric; + .reg .b64 %XBaseAddrGlobal; + .reg .b64 %YBaseAddrGlobal; + .reg .b64 %XAddr; + .reg .b64 %YAddr; + .reg .b64 %ThreadByteOffset; + + .reg .b32 %TID; + + ld.param.f32 %AValue, [A]; + ld.param.u64 %XBaseAddrGeneric, [X]; + ld.param.u64 %YBaseAddrGeneric, [Y]; + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; + mov.u32 %TID, %tid.x; + mul.wide.u32 %ThreadByteOffset, %TID, 4; + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; + ld.global.f32 %XValue, [%XAddr]; + ld.global.f32 %YValue, [%YAddr]; + fma.rn.f32 %Result, %AValue, %XValue, %YValue; + st.global.f32 [%XAddr], %Result; + ret; + } +)"; +/// [Example saxpy host PTX] + +int main() { + /// [Example saxpy host main] + namespace se = ::streamexecutor; + namespace cg = ::__compilergen; + + // Create some host data. + float A = 42.0f; + std::vector<float> HostX = {0, 1, 2, 3}; + std::vector<float> HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + // Get a device object. + se::Platform *Platform = + getOrDie(se::PlatformManager::getPlatformByName("CUDA")); + if (Platform->getDeviceCount() == 0) { + return EXIT_FAILURE; + } + se::Device *Device = getOrDie(Platform->getDevice(0)); + + // Load the kernel onto the device.
+ std::unique_ptr<cg::SaxpyKernel> Kernel = + getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); + + // Allocate memory on the device. + se::GlobalDeviceMemory<float> X = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + se::GlobalDeviceMemory<float> Y = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + + // Run operations on a stream. + std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream()); + Stream->thenCopyH2D(HostX, X) + .thenCopyH2D(HostY, Y) + .thenLaunch(ArraySize, 1, *Kernel, A, X, Y) + .thenCopyD2H(X, HostX); + // Wait for the stream to complete. + check(Stream->blockHostUntilDone()); + + // Process output data in HostX. + std::vector<float> ExpectedX = {4, 47, 90, 133}; + for (size_t I = 0; I < ArraySize; ++I) { + assert(HostX[I] == ExpectedX[I]); + } + + // Free device memory. + check(Device->freeDeviceMemory(X)); + check(Device->freeDeviceMemory(Y)); + /// [Example saxpy host main] +} Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h @@ -55,7 +55,7 @@ /// function as follows: /// \code
/// namespace ccn = compiler_cuda_namespace; -/// using KernelPtr = std::unique_ptr<compiler_cuda_namespace::SaxpyKernel>; +/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>; /// // Assumes Device is a pointer to the Device on which to launch the /// // kernel. /// // Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h @@ -115,8 +115,9 @@ /// Adds each item in SpecList to this object. /// /// Does not take ownership of the PTXCode pointers in the SpecList elements.
- CUDAPTXInMemorySpec(llvm::StringRef KernelName, - const llvm::ArrayRef<PTXSpec> SpecList); + CUDAPTXInMemorySpec( + llvm::StringRef KernelName, + const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList); /// Returns a pointer to the PTX code for the requested compute capability. /// Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h @@ -0,0 +1,71 @@ +//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/// \mainpage Getting Started
/// +/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming +/// models (runtimes). This abstraction cleanly permits host code to target +/// either CUDA or OpenCL devices with identically-functioning data parallel +/// kernels. It manages the execution of concurrent work targeting the +/// accelerator, similar to a host-side Executor. +/// +/// This version of StreamExecutor can be built either as a sub-project of the +/// LLVM project or as a standalone project depending on LLVM as an external +/// package. +/// +/// Below is an example of the use of the StreamExecutor API: +/// +/// \snippet examples/Example.cpp Example saxpy host main +/// +/// In the example, a couple of handler functions are used to handle error +/// return values in the StreamExecutor API: +/// +/// \snippet examples/Example.cpp Example saxpy host helper functions +/// +/// These are just example handler functions.
A real application will likely +/// want to define similar handlers of its own that log errors in an +/// application-specific way, convert errors to the application's own +/// error-handling framework, or try to recover from errors as appropriate. +/// +/// The example also references some symbols from a compiler-generated +/// namespace: +/// +/// \snippet examples/Example.cpp Example saxpy compiler-generated +/// +/// Instead of depending on the compiler to generate this code, you can +/// technically write the code yourself, but this is not recommended because the +/// code is very error-prone. For example, the template parameters for the +/// Kernel specialization have to match the parameter types for the device +/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid +/// device code for the kernel. Errors in this code will not show up until +/// runtime, and may only show up as garbage output rather than an explicit +/// error, which can be very hard to debug, so again, it is strongly advised not +/// to write this code yourself. +/// +/// The example compiler-generated code uses a PTX string in the source code to +/// store the device code, but the device code can also be stored in other +/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be +/// stored for other platforms such as OpenCL, and StreamExecutor will pick the +/// right device code at runtime based on the user's platform selection. See +/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be +/// stored for different platforms, but again, the code to set up the +/// MultiKernelLoaderSpec instance should be generated by the compiler if +/// possible, not by the user. 
+ +#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H +#define STREAMEXECUTOR_STREAMEXECUTOR_H + +#include "Device.h" +#include "Kernel.h" +#include "KernelSpec.h" +#include "Platform.h" +#include "PlatformManager.h" +#include "Stream.h" + +#endif // STREAMEXECUTOR_STREAMEXECUTOR_H