Index: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt @@ -1,2 +1,5 @@ add_executable(cuda_saxpy_example CUDASaxpy.cpp) target_link_libraries(cuda_saxpy_example streamexecutor) + +add_executable(host_saxpy_example HostSaxpy.cpp) +target_link_libraries(host_saxpy_example streamexecutor) Index: parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp +++ parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include Index: parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp +++ parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp @@ -0,0 +1,94 @@ +//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains example code demonstrating the usage of the +/// StreamExecutor API for a host platform. 
+///
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <vector>
+
+#include "streamexecutor/StreamExecutor.h"
+
+void Saxpy(float A, float *X, float *Y, size_t N) {
+  for (size_t I = 0; I < N; ++I)
+    X[I] = A * X[I] + Y[I];
+}
+
+namespace __compilergen {
+using SaxpyKernel =
+    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
+
+// Wrapper function converts argument addresses to arguments.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  Saxpy(*static_cast<const float *>(ArgumentAddresses[0]),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[1])),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[2])),
+        *static_cast<const size_t *>(ArgumentAddresses[3]));
+}
+
+// The wrapper function is what gets registered.
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
+  streamexecutor::MultiKernelLoaderSpec Spec;
+  Spec.addHostFunction("Saxpy", SaxpyWrapper);
+  return Spec;
+}();
+} // namespace __compilergen
+
+int main() {
+  namespace se = ::streamexecutor;
+  namespace cg = ::__compilergen;
+
+  // Create some host data.
+  float A = 42.0f;
+  std::vector<float> HostX = {0, 1, 2, 3};
+  std::vector<float> HostY = {4, 5, 6, 7};
+  size_t ArraySize = HostX.size();
+
+  // Get a device object.
+  se::Platform *Platform =
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
+  if (Platform->getDeviceCount() == 0) {
+    return EXIT_FAILURE;
+  }
+  se::Device *Device = getOrDie(Platform->getDevice(0));
+
+  // Load the kernel onto the device.
+  cg::SaxpyKernel Kernel =
+      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+  se::RegisteredHostMemory<float> RegisteredX =
+      getOrDie(Device->registerHostMemory<float>(HostX));
+  se::RegisteredHostMemory<float> RegisteredY =
+      getOrDie(Device->registerHostMemory<float>(HostY));
+
+  // Allocate memory on the device.
+  se::GlobalDeviceMemory<float> X =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+  se::GlobalDeviceMemory<float> Y =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+
+  // Run operations on a stream.
+  se::Stream Stream = getOrDie(Device->createStream());
+  Stream.thenCopyH2D(RegisteredX, X)
+      .thenCopyH2D(RegisteredY, Y)
+      .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
+      .thenCopyD2H(X, RegisteredX);
+  // Wait for the stream to complete.
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  // Check HostX at run time; an assert would vanish in NDEBUG builds.
+  std::vector<float> ExpectedX = {4, 47, 90, 133};
+  return HostX == ExpectedX ? EXIT_SUCCESS : EXIT_FAILURE;
+}
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
@@ -65,11 +65,13 @@
 #define STREAMEXECUTOR_KERNELSPEC_H
 
 #include <cassert>
+#include <functional>
 #include <memory>
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace streamexecutor {
@@ -199,6 +201,9 @@
 /// than doing it by hand.
 class MultiKernelLoaderSpec {
 public:
+  /// Type of functions used as host platform kernels.
+  using HostFunctionTy = std::function<void(const void *const *)>;
+
   std::string getKernelName() const {
     if (TheKernelName)
       return *TheKernelName;
@@ -215,6 +220,7 @@
   bool hasOpenCLTextInMemory() const {
     return TheOpenCLTextInMemorySpec != nullptr;
   }
+  bool hasHostFunction() const { return HostFunction != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   //
@@ -233,6 +239,11 @@
     return *TheOpenCLTextInMemorySpec;
   }
 
+  const HostFunctionTy &getHostFunction() const {
+    assert(hasHostFunction() && "getting spec that is not present");
+    return *HostFunction;
+  }
+
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec.
   //
@@ -256,6 +267,14 @@
   MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                                const char *OpenCLText);
 
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy Function) {
+    // Register the kernel name; the parameter was otherwise unused.
+    setKernelName(KernelName);
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
 private:
   void setKernelName(llvm::StringRef KernelName);
 
@@ -263,6 +282,7 @@
   std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
   std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
   std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+  std::unique_ptr<HostFunctionTy> HostFunction;
 };
 
 } // namespace streamexecutor
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -149,10 +149,10 @@
   /// Similar to synchronousCopyD2H(const void *, size_t, void
   /// *, size_t, size_t), but copies memory from one location in device memory
   /// to another rather than from device to host.
-  virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
-                                   size_t DstByteOffset,
-                                   const void *DeviceSrcHandle,
-                                   size_t SrcByteOffset, size_t ByteCount) {
+  virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+                                   size_t SrcByteOffset,
+                                   const void *DeviceDstHandle,
+                                   size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
                       getName());
   }
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
@@ -0,0 +1,56 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include "HostPlatformDevice.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform that performs work on the host rather than offloading to an
+/// accelerator.
+class HostPlatform : public Platform {
+public:
+  size_t getDeviceCount() const override { return 1; }
+
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex != 0) {
+      return make_error(
+          "Requested device index " + llvm::Twine(DeviceIndex) +
+          " from host platform which only supports device index 0");
+    }
+    llvm::sys::ScopedLock Lock(Mutex);
+    if (!TheDevice) {
+      ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+      TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+    }
+    return TheDevice.get();
+  }
+
+private:
+  llvm::sys::Mutex Mutex;
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
===================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -0,0 +1,163 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    // The handle points at the std::function owned by Spec, so Spec must
+    // outlive every kernel created from it.
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    // memmove, not memcpy: both ranges may lie in the same allocation.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    void *Memory = std::malloc(ByteCount);
+    // malloc(0) may legitimately return null, so only a failed non-empty
+    // request is reported as an error.
+    if (!Memory && ByteCount != 0) {
+      return make_error("failed to allocate " + llvm::Twine(ByteCount) +
+                        " bytes of host memory");
+    }
+    return Memory;
+  }
+
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    // memmove, not memcpy: both ranges may lie in the same allocation.
+    std::memmove(offset(DeviceDstHandle, DstByteOffset),
+                 offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+private:
+  /// Returns Base advanced by Offset bytes, with constness cast away.
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
Index: parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
+++ parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
 
 namespace streamexecutor {
 
@@ -23,6 +24,8 @@
   // appropriate code to include here.
   // * Use static initialization tricks to have platform libraries register
   // themselves when they are loaded.
+
+  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {