Index: streamexecutor/examples/CMakeLists.txt =================================================================== --- streamexecutor/examples/CMakeLists.txt +++ streamexecutor/examples/CMakeLists.txt @@ -1,2 +1,5 @@ add_executable(cuda_saxpy_example CUDASaxpy.cpp) target_link_libraries(cuda_saxpy_example streamexecutor) + +add_executable(host_saxpy_example HostSaxpy.cpp) +target_link_libraries(host_saxpy_example streamexecutor) Index: streamexecutor/examples/CUDASaxpy.cpp =================================================================== --- streamexecutor/examples/CUDASaxpy.cpp +++ streamexecutor/examples/CUDASaxpy.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include Index: streamexecutor/examples/HostSaxpy.cpp =================================================================== --- streamexecutor/examples/HostSaxpy.cpp +++ streamexecutor/examples/HostSaxpy.cpp @@ -1,4 +1,4 @@ -//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===// +//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===// // // The LLVM Compiler Infrastructure // @@ -9,91 +9,44 @@ /// /// \file /// This file contains example code demonstrating the usage of the -/// StreamExecutor API. Snippets of this file will be included as code examples -/// in documentation. Taking these examples from a real source file guarantees -/// that the examples will always compile. +/// StreamExecutor API for a host platform. /// //===----------------------------------------------------------------------===// #include #include #include -#include #include #include "streamexecutor/StreamExecutor.h" -/// [Example saxpy compiler-generated] -// Code in this namespace is generated by the compiler (e.g. clang). -// -// The name of this namespace may depend on the compiler that generated it, so -// this is just an example name. 
-namespace __compilergen { +void Saxpy(float A, float *X, float *Y, size_t N) { + for (size_t I = 0; I < N; ++I) + X[I] = A * X[I] + Y[I]; +} -// Specialization of the streamexecutor::Kernel template class for the parameter -// types of the saxpy(float A, float *X, float *Y) kernel. +namespace __compilergen { using SaxpyKernel = streamexecutor::Kernel, - streamexecutor::GlobalDeviceMemory>; - -// A string containing the PTX code generated by the device compiler for the -// saxpy kernel. String contents not shown here. -extern const char *SaxpyPTX; + streamexecutor::GlobalDeviceMemory, size_t>; + +// Wrapper function converts argument addresses to arguments. +void SaxpyWrapper(const void *const *ArgumentAddresses) { + Saxpy(*static_cast(ArgumentAddresses[0]), + static_cast(const_cast(ArgumentAddresses[1])), + static_cast(const_cast(ArgumentAddresses[2])), + *static_cast(ArgumentAddresses[3])); +} -// A global instance of a loader spec that knows how to load the code in the -// SaxpyPTX string. +// The wrapper function is what gets registered. static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { streamexecutor::MultiKernelLoaderSpec Spec; - Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + Spec.addHostFunction("Saxpy", SaxpyWrapper); return Spec; }(); - } // namespace __compilergen -/// [Example saxpy compiler-generated] - -/// [Example saxpy host PTX] -// The PTX text for a saxpy kernel. 
-const char *__compilergen::SaxpyPTX = R"( - .version 4.3 - .target sm_20 - .address_size 64 - - .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { - .reg .f32 %AValue; - .reg .f32 %XValue; - .reg .f32 %YValue; - .reg .f32 %Result; - - .reg .b64 %XBaseAddrGeneric; - .reg .b64 %YBaseAddrGeneric; - .reg .b64 %XBaseAddrGlobal; - .reg .b64 %YBaseAddrGlobal; - .reg .b64 %XAddr; - .reg .b64 %YAddr; - .reg .b64 %ThreadByteOffset; - - .reg .b32 %TID; - - ld.param.f32 %AValue, [A]; - ld.param.u64 %XBaseAddrGeneric, [X]; - ld.param.u64 %YBaseAddrGeneric, [Y]; - cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; - cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; - mov.u32 %TID, %tid.x; - mul.wide.u32 %ThreadByteOffset, %TID, 4; - add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; - add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; - ld.global.f32 %XValue, [%XAddr]; - ld.global.f32 %YValue, [%YAddr]; - fma.rn.f32 %Result, %AValue, %XValue, %YValue; - st.global.f32 [%XAddr], %Result; - ret; - } -)"; -/// [Example saxpy host PTX] int main() { - /// [Example saxpy host main] namespace se = ::streamexecutor; namespace cg = ::__compilergen; @@ -105,7 +58,7 @@ // Get a device object. se::Platform *Platform = - getOrDie(se::PlatformManager::getPlatformByName("CUDA")); + getOrDie(se::PlatformManager::getPlatformByName("host")); if (Platform->getDeviceCount() == 0) { return EXIT_FAILURE; } @@ -130,7 +83,7 @@ se::Stream Stream = getOrDie(Device->createStream()); Stream.thenCopyH2D(RegisteredX, X) .thenCopyH2D(RegisteredY, Y) - .thenLaunch(ArraySize, 1, Kernel, A, X, Y) + .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize) .thenCopyD2H(X, RegisteredX); // Wait for the stream to complete. se::dieIfError(Stream.blockHostUntilDone()); @@ -138,5 +91,4 @@ // Process output data in HostX. 
std::vector ExpectedX = {4, 47, 90, 133}; assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())); - /// [Example saxpy host main] } Index: streamexecutor/include/streamexecutor/KernelSpec.h =================================================================== --- streamexecutor/include/streamexecutor/KernelSpec.h +++ streamexecutor/include/streamexecutor/KernelSpec.h @@ -65,11 +65,13 @@ #define STREAMEXECUTOR_KERNELSPEC_H #include +#include #include #include #include #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" namespace streamexecutor { @@ -199,6 +201,9 @@ /// than doing it by hand. class MultiKernelLoaderSpec { public: + /// Type of functions used as host platform kernels. + using HostFunctionTy = std::function; + std::string getKernelName() const { if (TheKernelName) { return *TheKernelName; @@ -216,6 +221,7 @@ bool hasOpenCLTextInMemory() const { return TheOpenCLTextInMemorySpec != nullptr; } + bool hasHostFunction() const { return HostFunction != nullptr; } // Accessors for platform variant kernel load specifications. // @@ -234,6 +240,11 @@ return *TheOpenCLTextInMemorySpec; } + const HostFunctionTy &getHostFunction() const { + assert(hasHostFunction() && "getting spec that is not present"); + return *HostFunction; + } + // Builder-pattern-like methods for use in initializing a // MultiKernelLoaderSpec. 
// @@ -257,6 +268,15 @@ MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName, const char *OpenCLText); + /// Sets Function as the host platform implementation of this kernel and + /// passes KernelName to setKernelName(). + MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName, + HostFunctionTy Function) { + setKernelName(KernelName); + HostFunction = llvm::make_unique(std::move(Function)); + return *this; + } + private: void setKernelName(llvm::StringRef KernelName); @@ -264,6 +284,7 @@ std::unique_ptr TheCUDAPTXInMemorySpec; std::unique_ptr TheCUDAFatbinInMemorySpec; std::unique_ptr TheOpenCLTextInMemorySpec; + std::unique_ptr HostFunction; }; } // namespace streamexecutor Index: streamexecutor/include/streamexecutor/PlatformDevice.h =================================================================== --- streamexecutor/include/streamexecutor/PlatformDevice.h +++ streamexecutor/include/streamexecutor/PlatformDevice.h @@ -149,10 +149,10 @@ /// Similar to synchronousCopyD2H(const void *, size_t, void /// *, size_t, size_t), but copies memory from one location in device memory /// to another rather than from device to host. - virtual Error synchronousCopyD2D(const void *DeviceDstHandle, - size_t DstByteOffset, - const void *DeviceSrcHandle, - size_t SrcByteOffset, size_t ByteCount) { + virtual Error synchronousCopyD2D(const void *DeviceSrcHandle, + size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { return make_error("synchronousCopyD2D not implemented for platform " + getName()); } Index: streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h @@ -0,0 +1,56 @@ +//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the HostPlatform class. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H +#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H + +#include "HostPlatformDevice.h" +#include "streamexecutor/Device.h" +#include "streamexecutor/Platform.h" + +#include "llvm/Support/Mutex.h" + +namespace streamexecutor { +namespace host { + +/// Platform that performs work on the host rather than offloading to an +/// accelerator. +class HostPlatform : public Platform { +public: + size_t getDeviceCount() const override { return 1; } + + Expected getDevice(size_t DeviceIndex) override { + if (DeviceIndex != 0) { + return make_error( + "Requested device index " + llvm::Twine(DeviceIndex) + + " from host platform which only supports device index 0"); + } + llvm::sys::ScopedLock Lock(Mutex); + if (!TheDevice) { + ThePlatformDevice = llvm::make_unique(); + TheDevice = llvm::make_unique(ThePlatformDevice.get()); + } + return TheDevice.get(); + } + +private: + llvm::sys::Mutex Mutex; + std::unique_ptr ThePlatformDevice; + std::unique_ptr TheDevice; +}; + +} // namespace host +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H Index: streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h @@ -0,0 +1,151 @@ +//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the HostPlatformDevice class. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H +#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H + +#include +#include + +#include "streamexecutor/PlatformDevice.h" + +namespace streamexecutor { +namespace host { + +/// A concrete PlatformDevice subclass that performs its work on the host rather +/// than offloading to an accelerator. +class HostPlatformDevice : public PlatformDevice { +public: + std::string getName() const override { return "host"; } + + Expected + createKernel(const MultiKernelLoaderSpec &Spec) override { + if (!Spec.hasHostFunction()) { + return make_error("no host implementation available for kernel " + + Spec.getKernelName()); + } + return static_cast(&Spec.getHostFunction()); + } + + Error destroyKernel(const void *Handle) override { return Error::success(); } + + Expected createStream() override { + // TODO(jhen): Do something with threads to allow multiple streams. + return this; + } + + Error destroyStream(const void *Handle) override { return Error::success(); } + + Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) override { + // TODO(jhen): Can we do something with BlockSize and GridSize? 
+ if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) { + return make_error( + "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," + + llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) + + "), but only size (1,1,1) is permitted for this platform"); + } + if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) { + return make_error( + "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," + + llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) + + "), but only size (1,1,1) is permitted for this platform"); + } + + (*static_cast *>( + PKernelHandle))(ArgumentArray.getAddresses()); + return Error::success(); + } + + Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(HostDst, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(HostSrc, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error blockHostUntilDone(const void *PlatformStreamHandle) override { + // All host operations are synchronous anyway. 
+ return Error::success(); + } + + Expected allocateDeviceMemory(size_t ByteCount) override { + return std::malloc(ByteCount); + } + + Error freeDeviceMemory(const void *Handle) override { + std::free(const_cast(Handle)); + return Error::success(); + } + + Error registerHostMemory(void *Memory, size_t ByteCount) override { + return Error::success(); + } + + Error unregisterHostMemory(const void *Memory) override { + return Error::success(); + } + + Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset, + void *HostDst, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(HostDst, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(HostSrc, SrcByteOffset), ByteCount); + return Error::success(); + } + + Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override { + std::memcpy(offset(DeviceDstHandle, DstByteOffset), + offset(DeviceSrcHandle, SrcByteOffset), ByteCount); + return Error::success(); + } + +private: + static void *offset(const void *Base, size_t Offset) { + return const_cast(static_cast(Base) + Offset); + } +}; + +} // namespace host +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H Index: streamexecutor/lib/PlatformManager.cpp =================================================================== --- streamexecutor/lib/PlatformManager.cpp +++ streamexecutor/lib/PlatformManager.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "streamexecutor/PlatformManager.h" +#include "streamexecutor/platforms/host/HostPlatform.h" namespace streamexecutor { @@ -23,6 
+24,8 @@ // appropriate code to include here. // * Use static initialization tricks to have platform libraries register // themselves when they are loaded. + + PlatformsByName.emplace("host", llvm::make_unique()); } Expected PlatformManager::getPlatformByName(llvm::StringRef Name) {