Index: streamexecutor/CMakeLists.txt =================================================================== --- streamexecutor/CMakeLists.txt +++ streamexecutor/CMakeLists.txt @@ -3,9 +3,14 @@ option(STREAM_EXECUTOR_UNIT_TESTS "enable unit tests" ON) option(STREAM_EXECUTOR_ENABLE_DOXYGEN "enable StreamExecutor doxygen" ON) option(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL "enable building streamexecutor-config tool" ON) +option(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM "enable building the CUDA StreamExecutor platform" OFF) + +configure_file("include/streamexecutor/PlatformOptions.h.in" "include/streamexecutor/PlatformOptions.h") # First find includes relative to the streamexecutor top-level source path. include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include) +# Also look for configured headers in the top-level binary directory. +include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include) # If we are not building as part of LLVM, build StreamExecutor as a standalone # project using LLVM as an external library: Index: streamexecutor/include/streamexecutor/PlatformOptions.h.in =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/PlatformOptions.h.in @@ -0,0 +1,6 @@ +#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H +#define STREAMEXECUTOR_PLATFORMOPTIONS_H + +#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM + +#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H Index: streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h @@ -0,0 +1,42 @@ +//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the CUDAPlatform class. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H +#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H + +#include "streamexecutor/Platform.h" +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" + +#include "llvm/Support/Mutex.h" + +#include + +namespace streamexecutor { +namespace cuda { + +class CUDAPlatform : public Platform { +public: + size_t getDeviceCount() const override; + + Expected getDevice(size_t DeviceIndex) override; + +private: + llvm::sys::Mutex Mutex; + std::map PlatformDevices; +}; + +} // namespace cuda +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H Index: streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h @@ -0,0 +1,91 @@ +//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the CUDAPlatformDevice class. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H +#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H + +#include "streamexecutor/PlatformDevice.h" + +namespace streamexecutor { +namespace cuda { + +Error CUresultToError(int CUResult); + +class CUDAPlatformDevice : public PlatformDevice { +public: + static Expected create(size_t DeviceIndex); + + CUDAPlatformDevice(const CUDAPlatformDevice &) = delete; + CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete; + + CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept; + CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept; + + ~CUDAPlatformDevice() override; + + std::string getName() const override { return "CUDA"; } + + Expected + createKernel(const MultiKernelLoaderSpec &Spec) override; + Error destroyKernel(const void *Handle) override; + + Expected createStream() override; + Error destroyStream(const void *Handle) override; + + Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) override; + + Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, + size_t ByteCount) override; + + Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override; + + Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override; + + Error blockHostUntilDone(const void *PlatformStreamHandle) override; + + Expected allocateDeviceMemory(size_t ByteCount) override; + Error freeDeviceMemory(const void *Handle) override; + + Error registerHostMemory(void *Memory, size_t ByteCount) 
override; + Error unregisterHostMemory(const void *Memory) override; + + Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset, + void *HostDst, size_t DstByteOffset, + size_t ByteCount) override; + + Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override; + + Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset, + const void *DeviceSrcHandle, size_t SrcByteOffset, + size_t ByteCount) override; + +private: + CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {} + + int DeviceIndex; +}; + +} // namespace cuda +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H Index: streamexecutor/lib/CMakeLists.txt =================================================================== --- streamexecutor/lib/CMakeLists.txt +++ streamexecutor/lib/CMakeLists.txt @@ -3,6 +3,26 @@ set_target_properties(${name} PROPERTIES FOLDER "streamexecutor libraries") endmacro(add_se_library) +if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + set( + CMAKE_MODULE_PATH + ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/platforms/cuda/cmake/modules/") + + find_package(Libcuda REQUIRED) + include_directories(${LIBCUDA_INCLUDE_DIRS}) + + set( + STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT + $) + + set( + STREAM_EXECUTOR_LIBCUDA_LIBRARIES + ${LIBCUDA_LIBRARIES}) +endif(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + +add_subdirectory(platforms) + add_se_library( streamexecutor Device.cpp @@ -16,6 +36,8 @@ PlatformDevice.cpp PlatformManager.cpp Stream.cpp - ) + ${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT} + LINK_LIBS + ${STREAM_EXECUTOR_LIBCUDA_LIBRARIES}) install(TARGETS streamexecutor DESTINATION lib) Index: streamexecutor/lib/PlatformManager.cpp =================================================================== --- streamexecutor/lib/PlatformManager.cpp +++ streamexecutor/lib/PlatformManager.cpp @@ -13,8 +13,14 @@ 
//===----------------------------------------------------------------------===// #include "streamexecutor/PlatformManager.h" + +#include "streamexecutor/PlatformOptions.h" #include "streamexecutor/platforms/host/HostPlatform.h" +#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM +#include "streamexecutor/platforms/cuda/CUDAPlatform.h" +#endif + namespace streamexecutor { PlatformManager::PlatformManager() { @@ -26,6 +32,10 @@ // themselves when they are loaded. PlatformsByName.emplace("host", llvm::make_unique()); + +#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM + PlatformsByName.emplace("cuda", llvm::make_unique()); +#endif } Expected PlatformManager::getPlatformByName(llvm::StringRef Name) { Index: streamexecutor/lib/platforms/CMakeLists.txt =================================================================== --- /dev/null +++ streamexecutor/lib/platforms/CMakeLists.txt @@ -0,0 +1,3 @@ +if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + add_subdirectory(cuda) +endif() Index: streamexecutor/lib/platforms/cuda/CMakeLists.txt =================================================================== --- /dev/null +++ streamexecutor/lib/platforms/cuda/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library( + streamexecutor_cuda_platform + OBJECT + CUDAPlatform.cpp + CUDAPlatformDevice.cpp) Index: streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp =================================================================== --- /dev/null +++ streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp @@ -0,0 +1,63 @@ +//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation of CUDA platform internals. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/platforms/cuda/CUDAPlatform.h" +#include "streamexecutor/Device.h" +#include "streamexecutor/Platform.h" +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" + +#include "llvm/Support/Mutex.h" + +#include "cuda.h" + +#include + +namespace streamexecutor { +namespace cuda { + +size_t CUDAPlatform::getDeviceCount() const { + static CUresult InitResult = []() { return cuInit(0); }(); + + if (InitResult) + // TODO(jhen): Log an error. + return 0; + + int DeviceCount = 0; + CUresult Result = cuDeviceGetCount(&DeviceCount); + // TODO(jhen): Log an error. + + return DeviceCount; +} + +Expected CUDAPlatform::getDevice(size_t DeviceIndex) { + static CUresult InitResult = []() { return cuInit(0); }(); + + if (InitResult) + return CUresultToError(InitResult); + + llvm::sys::ScopedLock Lock(Mutex); + auto Iterator = PlatformDevices.find(DeviceIndex); + if (Iterator == PlatformDevices.end()) { + if (auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex)) { + Iterator = + PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first; + } else { + return MaybePDevice.takeError(); + } + } + return Device(&Iterator->second); +} + +} // namespace cuda +} // namespace streamexecutor Index: streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp =================================================================== --- /dev/null +++ streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp @@ -0,0 +1,227 @@ +//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation of CUDAPlatformDevice. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" +#include "streamexecutor/PlatformDevice.h" + +#include "cuda.h" + +namespace streamexecutor { +namespace cuda { + +static void *offset(const void *Base, size_t Offset) { + return const_cast(static_cast(Base) + Offset); +} + +Expected CUDAPlatformDevice::create(size_t DeviceIndex) { + CUdevice DeviceHandle; + if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex)) + return CUresultToError(Result); + + CUcontext ContextHandle; + if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle)) + return CUresultToError(Result); + + if (CUresult Result = cuCtxSetCurrent(ContextHandle)) + return CUresultToError(Result); + + return CUDAPlatformDevice(DeviceIndex); +} + +CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept + : DeviceIndex(Other.DeviceIndex) { + Other.DeviceIndex = -1; +} + +CUDAPlatformDevice &CUDAPlatformDevice:: +operator=(CUDAPlatformDevice &&Other) noexcept { + DeviceIndex = Other.DeviceIndex; + Other.DeviceIndex = -1; + return *this; +} + +CUDAPlatformDevice::~CUDAPlatformDevice() { + CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex); + // TODO(jhen): Log error. +} + +Expected +CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) { + // TODO(jhen): Maybe first check loaded modules? 
+ if (!Spec.hasCUDAPTXInMemory()) + return make_error("no CUDA source available to create kernel"); + + CUdevice Device = static_cast(DeviceIndex); + int ComputeCapabilityMajor = 0; + int ComputeCapabilityMinor = 0; + if (CUresult Result = cuDeviceGetAttribute( + &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + Device)) + return CUresultToError(Result); + if (CUresult Result = cuDeviceGetAttribute( + &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + Device)) + return CUresultToError(Result); + const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor, + ComputeCapabilityMinor); + + if (!Code) + return make_error("no suitable CUDA source found for compute capability " + + llvm::Twine(ComputeCapabilityMajor) + "." + + llvm::Twine(ComputeCapabilityMinor)); + + CUmodule Module; + if (CUresult Result = cuModuleLoadData(&Module, Code)) + return CUresultToError(Result); + + CUfunction Function; + if (CUresult Result = + cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str())) + return CUresultToError(Result); + + // TODO(jhen): Should I save this function pointer in case someone asks for + // it again? + + // TODO(jhen): Should I save the module pointer so I can unload it when I + // destroy this device? + + return static_cast(Function); +} + +Error CUDAPlatformDevice::destroyKernel(const void *Handle) { + // TODO(jhen): Maybe keep track of kernels for each module and unload the + // module after they are all destroyed. 
+ return Error::success(); +} + +Expected CUDAPlatformDevice::createStream() { + CUstream Stream; + if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT)) + return CUresultToError(Result); + return Stream; +} + +Error CUDAPlatformDevice::destroyStream(const void *Handle) { + return CUresultToError( + cuStreamDestroy(static_cast(const_cast(Handle)))); +} + +Error CUDAPlatformDevice::launch( + const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) { + CUfunction Function = + reinterpret_cast(const_cast(PKernelHandle)); + CUstream Stream = + reinterpret_cast(const_cast(PlatformStreamHandle)); + // TODO(jhen): Deal with shared memory arguments. + unsigned SharedMemoryBytes = 0; + void **ArgumentAddresses = const_cast(ArgumentArray.getAddresses()); + return CUresultToError(cuLaunchKernel( + Function, GridSize.X, GridSize.Y, GridSize.Z, BlockSize.X, BlockSize.Y, + BlockSize.Z, SharedMemoryBytes, Stream, ArgumentAddresses, nullptr)); +} + +Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle, + const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError(cuMemcpyDtoHAsync( + offset(HostDst, DstByteOffset), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount, + static_cast(const_cast(PlatformStreamHandle)))); +} + +Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle, + const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError(cuMemcpyHtoDAsync( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + offset(HostSrc, SrcByteOffset), ByteCount, + static_cast(const_cast(PlatformStreamHandle)))); +} + +Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle, + const void *DeviceSrcHandle, + size_t SrcByteOffset, + const void 
*DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError(cuMemcpyDtoDAsync( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount, + static_cast(const_cast(PlatformStreamHandle)))); +} + +Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) { + return CUresultToError(cuStreamSynchronize( + static_cast(const_cast(PlatformStreamHandle)))); +} + +Expected CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) { + CUdeviceptr Pointer; + if (CUresult Result = cuMemAlloc(&Pointer, ByteCount)) + return CUresultToError(Result); + return reinterpret_cast(Pointer); +} + +Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) { + return CUresultToError(cuMemFree(reinterpret_cast(Handle))); +} + +Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) { + return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u)); +} + +Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) { + return CUresultToError(cuMemHostUnregister(const_cast(Memory))); +} + +Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle, + size_t SrcByteOffset, + void *HostDst, + size_t DstByteOffset, + size_t ByteCount) { + return CUresultToError(cuMemcpyDtoH( + offset(HostDst, DstByteOffset), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount)); +} + +Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc, + size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, + size_t ByteCount) { + return CUresultToError(cuMemcpyHtoD( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + offset(HostSrc, SrcByteOffset), + ByteCount)); +} + +Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle, + size_t DstByteOffset, + const void *DeviceSrcHandle, + size_t SrcByteOffset, + size_t ByteCount) { + return CUresultToError(cuMemcpyDtoD( + 
reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount)); +} + +} // namespace cuda +} // namespace streamexecutor Index: streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake =================================================================== --- /dev/null +++ streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake @@ -0,0 +1,19 @@ +# - Try to find the libcuda library +# Once done this will define +# LIBCUDA_FOUND - System has libcuda +# LIBCUDA_INCLUDE_DIRS - The libcuda include directories +# LIBCUDA_LIBRARIES - The libraries needed to use libcuda + +find_path(LIBCUDA_INCLUDE_DIR cuda.h /usr/local/cuda/include) +find_library(LIBCUDA_LIBRARY cuda) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set LIBCUDA_FOUND to TRUE if +# all listed variables are TRUE +find_package_handle_standard_args( + LIBCUDA DEFAULT_MSG LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY) + +mark_as_advanced(LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY) + +set(LIBCUDA_LIBRARIES ${LIBCUDA_LIBRARY}) +set(LIBCUDA_INCLUDE_DIRS ${LIBCUDA_INCLUDE_DIR})