|
| 1 | +//===-- Example.cpp - Example code for documentation ----------------------===// |
| 2 | +// |
| 3 | +// The LLVM Compiler Infrastructure |
| 4 | +// |
| 5 | +// This file is distributed under the University of Illinois Open Source |
| 6 | +// License. See LICENSE.TXT for details. |
| 7 | +// |
| 8 | +//===----------------------------------------------------------------------===// |
| 9 | +/// |
| 10 | +/// \file |
| 11 | +/// This file contains example code demonstrating the usage of the |
| 12 | +/// StreamExecutor API. Snippets of this file will be included as code examples |
| 13 | +/// in documentation. Taking these examples from a real source file guarantees |
| 14 | +/// that the examples will always compile. |
| 15 | +/// |
| 16 | +//===----------------------------------------------------------------------===// |
| 17 | + |
| 18 | +#include <cassert> |
| 19 | +#include <cstdio> |
| 20 | +#include <cstdlib> |
| 21 | +#include <memory> |
| 22 | +#include <vector> |
| 23 | + |
| 24 | +#include "streamexecutor/StreamExecutor.h" |
| 25 | + |
| 26 | +/// [Example saxpy host helper functions] |
| 27 | +// Example handler for streamexecutor::Expected return values. |
| 28 | +template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) { |
| 29 | + if (!E) { |
| 30 | + std::fprintf(stderr, "Error extracting an expected value: %s.\n", |
| 31 | + streamexecutor::consumeAndGetMessage(E.takeError()).c_str()); |
| 32 | + std::exit(EXIT_FAILURE); |
| 33 | + } |
| 34 | + return std::move(*E); |
| 35 | +} |
| 36 | + |
| 37 | +// Example handler for streamexecutor::Error return values. |
| 38 | +void check(streamexecutor::Error &&E) { |
| 39 | + if (E) { |
| 40 | + std::fprintf(stderr, "Error encountered: %s.\n", |
| 41 | + streamexecutor::consumeAndGetMessage(std::move(E)).c_str()); |
| 42 | + std::exit(EXIT_FAILURE); |
| 43 | + } |
| 44 | +} |
| 45 | +/// [Example saxpy host helper functions] |
| 46 | + |
| 47 | +/// [Example saxpy compiler-generated] |
| 48 | +// Code in this namespace is generated by the compiler (e.g. clang). |
| 49 | +// |
| 50 | +// The name of this namespace may depend on the compiler that generated it, so |
| 51 | +// this is just an example name. |
| 52 | +namespace __compilergen { |
| 53 | + |
| 54 | +// Specialization of the streamexecutor::Kernel template class for the parameter |
| 55 | +// types of the saxpy(float A, float *X, float *Y) kernel. |
| 56 | +using SaxpyKernel = |
| 57 | + streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, |
| 58 | + streamexecutor::GlobalDeviceMemory<float>>; |
| 59 | + |
| 60 | +// A string containing the PTX code generated by the device compiler for the |
| 61 | +// saxpy kernel. String contents not shown here. |
| 62 | +extern const char *SaxpyPTX; |
| 63 | + |
| 64 | +// A global instance of a loader spec that knows how to load the code in the |
| 65 | +// SaxpyPTX string. |
| 66 | +static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { |
| 67 | + streamexecutor::MultiKernelLoaderSpec Spec; |
| 68 | + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); |
| 69 | + return Spec; |
| 70 | +}(); |
| 71 | + |
| 72 | +} // namespace __compilergen |
| 73 | +/// [Example saxpy compiler-generated] |
| 74 | + |
| 75 | +/// [Example saxpy host PTX] |
// PTX text for the saxpy kernel. Each thread takes %tid.x as its element
// index (scaled by 4 bytes), loads X[i] and Y[i] from global memory, computes
// A * X[i] + Y[i] with a single fma.rn.f32, and stores the result back into
// X[i] (not Y[i]). The code contains no bounds check on the thread index.
| 76 | +const char *__compilergen::SaxpyPTX = R"( |
| 77 | + .version 4.3 |
| 78 | + .target sm_20 |
| 79 | + .address_size 64 |
| 80 | +

| 81 | + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { |
| 82 | + .reg .f32 %AValue; |
| 83 | + .reg .f32 %XValue; |
| 84 | + .reg .f32 %YValue; |
| 85 | + .reg .f32 %Result; |
| 86 | +

| 87 | + .reg .b64 %XBaseAddrGeneric; |
| 88 | + .reg .b64 %YBaseAddrGeneric; |
| 89 | + .reg .b64 %XBaseAddrGlobal; |
| 90 | + .reg .b64 %YBaseAddrGlobal; |
| 91 | + .reg .b64 %XAddr; |
| 92 | + .reg .b64 %YAddr; |
| 93 | + .reg .b64 %ThreadByteOffset; |
| 94 | +

| 95 | + .reg .b32 %TID; |
| 96 | +

| 97 | + ld.param.f32 %AValue, [A]; |
| 98 | + ld.param.u64 %XBaseAddrGeneric, [X]; |
| 99 | + ld.param.u64 %YBaseAddrGeneric, [Y]; |
| 100 | + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; |
| 101 | + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; |
| 102 | + mov.u32 %TID, %tid.x; |
| 103 | + mul.wide.u32 %ThreadByteOffset, %TID, 4; |
| 104 | + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; |
| 105 | + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; |
| 106 | + ld.global.f32 %XValue, [%XAddr]; |
| 107 | + ld.global.f32 %YValue, [%YAddr]; |
| 108 | + fma.rn.f32 %Result, %AValue, %XValue, %YValue; |
| 109 | + st.global.f32 [%XAddr], %Result; |
| 110 | + ret; |
| 111 | + } |
| 112 | +)"; |
| 113 | +/// [Example saxpy host PTX] |
| 114 | + |
| 115 | +int main() { |
| 116 | + /// [Example saxpy host main] |
| 117 | + namespace se = ::streamexecutor; |
| 118 | + namespace cg = ::__compilergen; |
| 119 | + |
| 120 | + // Create some host data. |
| 121 | + float A = 42.0f; |
| 122 | + std::vector<float> HostX = {0, 1, 2, 3}; |
| 123 | + std::vector<float> HostY = {4, 5, 6, 7}; |
| 124 | + size_t ArraySize = HostX.size(); |
| 125 | + |
| 126 | + // Get a device object. |
| 127 | + se::Platform *Platform = |
| 128 | + getOrDie(se::PlatformManager::getPlatformByName("CUDA")); |
| 129 | + if (Platform->getDeviceCount() == 0) { |
| 130 | + return EXIT_FAILURE; |
| 131 | + } |
| 132 | + se::Device *Device = getOrDie(Platform->getDevice(0)); |
| 133 | + |
| 134 | + // Load the kernel onto the device. |
| 135 | + std::unique_ptr<cg::SaxpyKernel> Kernel = |
| 136 | + getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); |
| 137 | + |
| 138 | + // Allocate memory on the device. |
| 139 | + se::GlobalDeviceMemory<float> X = |
| 140 | + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); |
| 141 | + se::GlobalDeviceMemory<float> Y = |
| 142 | + getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); |
| 143 | + |
| 144 | + // Run operations on a stream. |
| 145 | + std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream()); |
| 146 | + Stream->thenCopyH2D<float>(HostX, X) |
| 147 | + .thenCopyH2D<float>(HostY, Y) |
| 148 | + .thenLaunch(ArraySize, 1, *Kernel, A, X, Y) |
| 149 | + .thenCopyD2H<float>(X, HostX); |
| 150 | + // Wait for the stream to complete. |
| 151 | + check(Stream->blockHostUntilDone()); |
| 152 | + |
| 153 | + // Process output data in HostX. |
| 154 | + std::vector<float> ExpectedX = {4, 47, 90, 133}; |
| 155 | + for (size_t I = 0; I < ArraySize; ++I) { |
| 156 | + assert(HostX[I] == ExpectedX[I]); |
| 157 | + } |
| 158 | + |
| 159 | + // Free device memory. |
| 160 | + check(Device->freeDeviceMemory(X)); |
| 161 | + check(Device->freeDeviceMemory(Y)); |
| 162 | + /// [Example saxpy host main] |
| 163 | +} |
0 commit comments