Index: parallel-libs/trunk/streamexecutor/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/CMakeLists.txt @@ -62,6 +62,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter") add_subdirectory(lib) +add_subdirectory(examples) if (STREAM_EXECUTOR_ENABLE_DOXYGEN) find_package(Doxygen REQUIRED) Index: parallel-libs/trunk/streamexecutor/Doxyfile.in =================================================================== --- parallel-libs/trunk/streamexecutor/Doxyfile.in +++ parallel-libs/trunk/streamexecutor/Doxyfile.in @@ -398,7 +398,7 @@ # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class will # be included in the documentation. @@ -811,7 +811,7 @@ # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@ # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and Index: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(example Example.cpp) +target_link_libraries(example streamexecutor) Index: parallel-libs/trunk/streamexecutor/examples/Example.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/Example.cpp +++ parallel-libs/trunk/streamexecutor/examples/Example.cpp @@ -0,0 +1,163 @@ +//===-- Example.cpp - Example code for documentation ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file
/// This file contains example code demonstrating the usage of the +/// StreamExecutor API. Snippets of this file will be included as code examples +/// in documentation. Taking these examples from a real source file guarantees +/// that the examples will always compile. +/// +//===----------------------------------------------------------------------===// + +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <memory> +#include <vector> + +#include "streamexecutor/StreamExecutor.h" + +/// [Example saxpy host helper functions] +// Example handler for streamexecutor::Expected return values.
+template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) { + if (!E) { + std::fprintf(stderr, "Error extracting an expected value: %s.\n", + streamexecutor::consumeAndGetMessage(E.takeError()).c_str()); + std::exit(EXIT_FAILURE); + } + return std::move(*E); +} + +// Example handler for streamexecutor::Error return values. +void check(streamexecutor::Error &&E) { + if (E) { + std::fprintf(stderr, "Error encountered: %s.\n", + streamexecutor::consumeAndGetMessage(std::move(E)).c_str()); + std::exit(EXIT_FAILURE); + } +} +/// [Example saxpy host helper functions] + +/// [Example saxpy compiler-generated] +// Code in this namespace is generated by the compiler (e.g. clang). +// +// The name of this namespace may depend on the compiler that generated it, so +// this is just an example name. +namespace __compilergen { + +// Specialization of the streamexecutor::Kernel template class for the parameter +// types of the saxpy(float A, float *X, float *Y) kernel. +using SaxpyKernel = + streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, + streamexecutor::GlobalDeviceMemory<float>>; + +// A string containing the PTX code generated by the device compiler for the +// saxpy kernel. String contents not shown here. +extern const char *SaxpyPTX; + +// A global instance of a loader spec that knows how to load the code in the +// SaxpyPTX string.
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + return Spec; +}(); + +} // namespace __compilergen +/// [Example saxpy compiler-generated] + +/// [Example saxpy host PTX] +const char *__compilergen::SaxpyPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { + .reg .f32 %AValue; + .reg .f32 %XValue; + .reg .f32 %YValue; + .reg .f32 %Result; + + .reg .b64 %XBaseAddrGeneric; + .reg .b64 %YBaseAddrGeneric; + .reg .b64 %XBaseAddrGlobal; + .reg .b64 %YBaseAddrGlobal; + .reg .b64 %XAddr; + .reg .b64 %YAddr; + .reg .b64 %ThreadByteOffset; + + .reg .b32 %TID; + + ld.param.f32 %AValue, [A]; + ld.param.u64 %XBaseAddrGeneric, [X]; + ld.param.u64 %YBaseAddrGeneric, [Y]; + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; + mov.u32 %TID, %tid.x; + mul.wide.u32 %ThreadByteOffset, %TID, 4; + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; + ld.global.f32 %XValue, [%XAddr]; + ld.global.f32 %YValue, [%YAddr]; + fma.rn.f32 %Result, %AValue, %XValue, %YValue; + st.global.f32 [%XAddr], %Result; + ret; + } +)"; +/// [Example saxpy host PTX] + +int main() { + /// [Example saxpy host main] + namespace se = ::streamexecutor; + namespace cg = ::__compilergen; + + // Create some host data. + float A = 42.0f; + std::vector<float> HostX = {0, 1, 2, 3}; + std::vector<float> HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + // Get a device object. + se::Platform *Platform = + getOrDie(se::PlatformManager::getPlatformByName("CUDA")); + if (Platform->getDeviceCount() == 0) { + return EXIT_FAILURE; + } + se::Device *Device = getOrDie(Platform->getDevice(0)); + + // Load the kernel onto the device.
+ std::unique_ptr<cg::SaxpyKernel> Kernel = + getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); + + // Allocate memory on the device. + se::GlobalDeviceMemory<float> X = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + se::GlobalDeviceMemory<float> Y = + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); + + // Run operations on a stream. + std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream()); + Stream->thenCopyH2D(HostX, X) + .thenCopyH2D(HostY, Y) + .thenLaunch(ArraySize, 1, *Kernel, A, X, Y) + .thenCopyD2H(X, HostX); + // Wait for the stream to complete. + check(Stream->blockHostUntilDone()); + + // Process output data in HostX. + std::vector<float> ExpectedX = {4, 47, 90, 133}; + for (size_t I = 0; I < ArraySize; ++I) { + assert(HostX[I] == ExpectedX[I]); + } + + // Free device memory. + check(Device->freeDeviceMemory(X)); + check(Device->freeDeviceMemory(Y)); + /// [Example saxpy host main] +} Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/Kernel.h @@ -55,7 +55,7 @@ /// function as follows: /// \code
/// namespace ccn = compiler_cuda_namespace; -/// using KernelPtr = std::unique_ptr<compiler_cuda_namespace::SaxpyKernel>; +/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>; /// // Assumes Device is a pointer to the Device on which to launch the /// // kernel. /// // Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h @@ -115,8 +115,9 @@ /// Adds each item in SpecList to this object. /// /// Does not take ownership of the PTXCode pointers in the SpecList elements.
- CUDAPTXInMemorySpec(llvm::StringRef KernelName, - const llvm::ArrayRef<PTXSpec> SpecList); + CUDAPTXInMemorySpec( + llvm::StringRef KernelName, + const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList); /// Returns a pointer to the PTX code for the requested compute capability. /// Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h @@ -0,0 +1,71 @@ +//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/// \mainpage Getting Started
/// +/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming +/// models (runtimes). This abstraction cleanly permits host code to target +/// either CUDA or OpenCL devices with identically-functioning data parallel +/// kernels. It manages the execution of concurrent work targeting the +/// accelerator, similar to a host-side Executor. +/// +/// This version of StreamExecutor can be built either as a sub-project of the +/// LLVM project or as a standalone project depending on LLVM as an external +/// package. +/// +/// Below is an example of the use of the StreamExecutor API: +/// +/// \snippet examples/Example.cpp Example saxpy host main +/// +/// In the example, a couple of handler functions are used to handle error +/// return values in the StreamExecutor API: +/// +/// \snippet examples/Example.cpp Example saxpy host helper functions +/// +/// These are just example handler functions.
A real application will likely +/// want to define similar handlers of its own that log errors in an +/// application-specific way, convert errors to the application's own +/// error-handling framework, or try to recover from errors as appropriate. +/// +/// The example also references some symbols from a compiler-generated +/// namespace: +/// +/// \snippet examples/Example.cpp Example saxpy compiler-generated +/// +/// Instead of depending on the compiler to generate this code, you can +/// technically write the code yourself, but this is not recommended because the +/// code is very error-prone. For example, the template parameters for the +/// Kernel specialization have to match the parameter types for the device +/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid +/// device code for the kernel. Errors in this code will not show up until +/// runtime, and may only show up as garbage output rather than an explicit +/// error, which can be very hard to debug, so again, it is strongly advised not +/// to write this code yourself. +/// +/// The example compiler-generated code uses a PTX string in the source code to +/// store the device code, but the device code can also be stored in other +/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be +/// stored for other platforms such as OpenCL, and StreamExecutor will pick the +/// right device code at runtime based on the user's platform selection. See +/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be +/// stored for different platforms, but again, the code to set up the +/// MultiKernelLoaderSpec instance should be generated by the compiler if +/// possible, not by the user. 
+ +#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H +#define STREAMEXECUTOR_STREAMEXECUTOR_H + +#include "Device.h" +#include "Kernel.h" +#include "KernelSpec.h" +#include "Platform.h" +#include "PlatformManager.h" +#include "Stream.h" + +#endif // STREAMEXECUTOR_STREAMEXECUTOR_H