Index: parallel-libs/trunk/streamexecutor/Doxyfile.in =================================================================== --- parallel-libs/trunk/streamexecutor/Doxyfile.in +++ parallel-libs/trunk/streamexecutor/Doxyfile.in @@ -811,7 +811,7 @@ # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@ +EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/examples # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -825,7 +825,7 @@ # irrespective of the value of the RECURSIVE tag. # The default value is: NO. -EXAMPLE_RECURSIVE = NO +EXAMPLE_RECURSIVE = YES # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the @@ -983,7 +983,7 @@ # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -COLS_IN_ALPHA_INDEX = 5 +COLS_IN_ALPHA_INDEX = 1 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag @@ -1068,7 +1068,7 @@ # see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = @CMAKE_CURRENT_SOURCE_DIR@/customdoxygen.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1352,7 +1352,7 @@ # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -GENERATE_TREEVIEW = NO +GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. Index: parallel-libs/trunk/streamexecutor/customdoxygen.css =================================================================== --- parallel-libs/trunk/streamexecutor/customdoxygen.css +++ parallel-libs/trunk/streamexecutor/customdoxygen.css @@ -0,0 +1,20 @@ +body { + background-color: #e0e0eb; +} + +div.header { + margin-left: auto; + margin-right: auto; + max-width: 60em; + padding-left: 2em; + padding-right: 2em; +} + +div.contents { + margin-left: auto; + margin-right: auto; + max-width: 60em; + background-color: white; + padding: 2em; + border-radius: 1em; +} Index: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(example Example.cpp) -target_link_libraries(example streamexecutor) +add_executable(cuda_saxpy_example CUDASaxpy.cpp) +target_link_libraries(cuda_saxpy_example streamexecutor) Index: parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp +++ parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp @@ -0,0 +1,137 @@ +//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains example code demonstrating the usage of the +/// StreamExecutor API. Snippets of this file will be included as code examples +/// in documentation. Taking these examples from a real source file guarantees +/// that the examples will always compile. +/// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "streamexecutor/StreamExecutor.h" + +/// [Example saxpy compiler-generated] +// Code in this namespace is generated by the compiler (e.g. clang). +// +// The name of this namespace may depend on the compiler that generated it, so +// this is just an example name. +namespace __compilergen { + +// Specialization of the streamexecutor::Kernel template class for the parameter +// types of the saxpy(float A, float *X, float *Y) kernel. +using SaxpyKernel = + streamexecutor::Kernel, + streamexecutor::GlobalDeviceMemory>; + +// A string containing the PTX code generated by the device compiler for the +// saxpy kernel. String contents not shown here. +extern const char *SaxpyPTX; + +// A global instance of a loader spec that knows how to load the code in the +// SaxpyPTX string. +static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + return Spec; +}(); + +} // namespace __compilergen +/// [Example saxpy compiler-generated] + +/// [Example saxpy host PTX] +// The PTX text for a saxpy kernel. +const char *__compilergen::SaxpyPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { + .reg .f32 %AValue; + .reg .f32 %XValue; + .reg .f32 %YValue; + .reg .f32 %Result; + + .reg .b64 %XBaseAddrGeneric; + .reg .b64 %YBaseAddrGeneric; + .reg .b64 %XBaseAddrGlobal; + .reg .b64 %YBaseAddrGlobal; + .reg .b64 %XAddr; + .reg .b64 %YAddr; + .reg .b64 %ThreadByteOffset; + + .reg .b32 %TID; + + ld.param.f32 %AValue, [A]; + ld.param.u64 %XBaseAddrGeneric, [X]; + ld.param.u64 %YBaseAddrGeneric, [Y]; + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; + mov.u32 %TID, %tid.x; + mul.wide.u32 %ThreadByteOffset, %TID, 4; + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; + ld.global.f32 %XValue, [%XAddr]; + ld.global.f32 %YValue, [%YAddr]; + fma.rn.f32 %Result, %AValue, %XValue, %YValue; + st.global.f32 [%XAddr], %Result; + ret; + } +)"; +/// [Example saxpy host PTX] + +int main() { + /// [Example saxpy host main] + namespace se = ::streamexecutor; + namespace cg = ::__compilergen; + + // Create some host data. + float A = 42.0f; + std::vector HostX = {0, 1, 2, 3}; + std::vector HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + // Get a device object. + se::Platform *Platform = + getOrDie(se::PlatformManager::getPlatformByName("CUDA")); + if (Platform->getDeviceCount() == 0) { + return EXIT_FAILURE; + } + se::Device *Device = getOrDie(Platform->getDevice(0)); + + // Load the kernel onto the device. + cg::SaxpyKernel Kernel = + getOrDie(Device->createKernel(cg::SaxpyLoaderSpec)); + + // Allocate memory on the device. + se::GlobalDeviceMemory X = + getOrDie(Device->allocateDeviceMemory(ArraySize)); + se::GlobalDeviceMemory Y = + getOrDie(Device->allocateDeviceMemory(ArraySize)); + + // Run operations on a stream. + se::Stream Stream = getOrDie(Device->createStream()); + Stream.thenCopyH2D(HostX, X) + .thenCopyH2D(HostY, Y) + .thenLaunch(ArraySize, 1, Kernel, A, X, Y) + .thenCopyD2H(X, HostX); + // Wait for the stream to complete. + se::dieIfError(Stream.blockHostUntilDone()); + + // Process output data in HostX. + std::vector ExpectedX = {4, 47, 90, 133}; + assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())); + /// [Example saxpy host main] +} Index: parallel-libs/trunk/streamexecutor/examples/Example.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/Example.cpp +++ parallel-libs/trunk/streamexecutor/examples/Example.cpp @@ -1,137 +0,0 @@ -//===-- Example.cpp - Example code for documentation ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains example code demonstrating the usage of the -/// StreamExecutor API. Snippets of this file will be included as code examples -/// in documentation. Taking these examples from a real source file guarantees -/// that the examples will always compile. -/// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include - -#include "streamexecutor/StreamExecutor.h" - -/// [Example saxpy compiler-generated] -// Code in this namespace is generated by the compiler (e.g. clang). -// -// The name of this namespace may depend on the compiler that generated it, so -// this is just an example name. -namespace __compilergen { - -// Specialization of the streamexecutor::Kernel template class for the parameter -// types of the saxpy(float A, float *X, float *Y) kernel. -using SaxpyKernel = - streamexecutor::Kernel, - streamexecutor::GlobalDeviceMemory>; - -// A string containing the PTX code generated by the device compiler for the -// saxpy kernel. String contents not shown here. -extern const char *SaxpyPTX; - -// A global instance of a loader spec that knows how to load the code in the -// SaxpyPTX string. -static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { - streamexecutor::MultiKernelLoaderSpec Spec; - Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); - return Spec; -}(); - -} // namespace __compilergen -/// [Example saxpy compiler-generated] - -/// [Example saxpy host PTX] -const char *__compilergen::SaxpyPTX = R"( - .version 4.3 - .target sm_20 - .address_size 64 - - .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { - .reg .f32 %AValue; - .reg .f32 %XValue; - .reg .f32 %YValue; - .reg .f32 %Result; - - .reg .b64 %XBaseAddrGeneric; - .reg .b64 %YBaseAddrGeneric; - .reg .b64 %XBaseAddrGlobal; - .reg .b64 %YBaseAddrGlobal; - .reg .b64 %XAddr; - .reg .b64 %YAddr; - .reg .b64 %ThreadByteOffset; - - .reg .b32 %TID; - - ld.param.f32 %AValue, [A]; - ld.param.u64 %XBaseAddrGeneric, [X]; - ld.param.u64 %YBaseAddrGeneric, [Y]; - cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; - cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; - mov.u32 %TID, %tid.x; - mul.wide.u32 %ThreadByteOffset, %TID, 4; - add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; - add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; - ld.global.f32 %XValue, [%XAddr]; - ld.global.f32 %YValue, [%YAddr]; - fma.rn.f32 %Result, %AValue, %XValue, %YValue; - st.global.f32 [%XAddr], %Result; - ret; - } -)"; -/// [Example saxpy host PTX] - -int main() { - /// [Example saxpy host main] - namespace se = ::streamexecutor; - namespace cg = ::__compilergen; - - // Create some host data. - float A = 42.0f; - std::vector HostX = {0, 1, 2, 3}; - std::vector HostY = {4, 5, 6, 7}; - size_t ArraySize = HostX.size(); - - // Get a device object. - se::Platform *Platform = - getOrDie(se::PlatformManager::getPlatformByName("CUDA")); - if (Platform->getDeviceCount() == 0) { - return EXIT_FAILURE; - } - se::Device *Device = getOrDie(Platform->getDevice(0)); - - // Load the kernel onto the device. - cg::SaxpyKernel Kernel = - getOrDie(Device->createKernel(cg::SaxpyLoaderSpec)); - - // Allocate memory on the device. - se::GlobalDeviceMemory X = - getOrDie(Device->allocateDeviceMemory(ArraySize)); - se::GlobalDeviceMemory Y = - getOrDie(Device->allocateDeviceMemory(ArraySize)); - - // Run operations on a stream. - se::Stream Stream = getOrDie(Device->createStream()); - Stream.thenCopyH2D(HostX, X) - .thenCopyH2D(HostY, Y) - .thenLaunch(ArraySize, 1, Kernel, A, X, Y) - .thenCopyD2H(X, HostX); - // Wait for the stream to complete. - se::dieIfError(Stream.blockHostUntilDone()); - - // Process output data in HostX. - std::vector ExpectedX = {4, 47, 90, 133}; - for (size_t I = 0; I < ArraySize; ++I) { - assert(HostX[I] == ExpectedX[I]); - } - /// [Example saxpy host main] -} Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/StreamExecutor.h @@ -7,8 +7,9 @@ // //===----------------------------------------------------------------------===// -/// \mainpage Getting Started +/// \mainpage Welcome to StreamExecutor /// +/// \section Introduction /// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming /// models (runtimes). This abstraction cleanly permits host code to target /// either CUDA or OpenCL devices with identically-functioning data parallel @@ -19,9 +20,10 @@ /// LLVM project or as a standalone project depending on LLVM as an external /// package. /// +/// \subsection ExampleUsage Example Usage /// Below is an example of the use of the StreamExecutor API: /// -/// \snippet examples/Example.cpp Example saxpy host main +/// \snippet examples/CUDASaxpy.cpp Example saxpy host main /// /// In the example, a couple of handler functions, \c getOrDie and \c /// dieIfError, are used to handle error return values in the StreamExecutor @@ -30,10 +32,12 @@ /// versions of these handlers so that errors are handled more gracefully than /// just exiting the program. /// +/// \subsection CompilerGeneratedCode Compiler-Generated Code +/// /// The example also references some symbols from a compiler-generated /// namespace: /// -/// \snippet examples/Example.cpp Example saxpy compiler-generated +/// \snippet examples/CUDASaxpy.cpp Example saxpy compiler-generated /// /// Instead of depending on the compiler to generate this code, you can /// technically write the code yourself, but this is not recommended because the @@ -55,6 +59,9 @@ /// MultiKernelLoaderSpec instance should be generated by the compiler if /// possible, not by the user. +/// \example examples/CUDASaxpy.cpp +/// Running saxpy on a CUDA device. + #ifndef STREAMEXECUTOR_STREAMEXECUTOR_H #define STREAMEXECUTOR_STREAMEXECUTOR_H