Index: parallel-libs/trunk/streamexecutor/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/CMakeLists.txt @@ -3,9 +3,14 @@ option(STREAM_EXECUTOR_UNIT_TESTS "enable unit tests" ON) option(STREAM_EXECUTOR_ENABLE_DOXYGEN "enable StreamExecutor doxygen" ON) option(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL "enable building streamexecutor-config tool" ON) +option(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM "enable building the CUDA StreamExecutor platform" OFF) + +configure_file("include/streamexecutor/PlatformOptions.h.in" "include/streamexecutor/PlatformOptions.h") # First find includes relative to the streamexecutor top-level source path. include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include) +# Also look for configured headers in the top-level binary directory. +include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include) # If we are not building as part of LLVM, build StreamExecutor as a standalone # project using LLVM as an external library: Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h @@ -37,25 +37,29 @@ virtual std::string getName() const = 0; + virtual std::string getPlatformName() const = 0; + /// Creates a platform-specific kernel. virtual Expected createKernel(const MultiKernelLoaderSpec &Spec) { - return make_error("createKernel not implemented for platform " + getName()); + return make_error("createKernel not implemented for platform " + + getPlatformName()); } virtual Error destroyKernel(const void *Handle) { return make_error("destroyKernel not implemented for platform " + - getName()); + getPlatformName()); } /// Creates a platform-specific stream. 
virtual Expected createStream() { - return make_error("createStream not implemented for platform " + getName()); + return make_error("createStream not implemented for platform " + + getPlatformName()); } virtual Error destroyStream(const void *Handle) { return make_error("destroyStream not implemented for platform " + - getName()); + getPlatformName()); } /// Launches a kernel on the given stream. @@ -63,7 +67,8 @@ BlockDimensions BlockSize, GridDimensions GridSize, const void *PKernelHandle, const PackedKernelArgumentArrayBase &ArgumentArray) { - return make_error("launch not implemented for platform " + getName()); + return make_error("launch not implemented for platform " + + getPlatformName()); } /// Copies data from the device to the host. @@ -72,7 +77,8 @@ virtual Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle, size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, size_t ByteCount) { - return make_error("copyD2H not implemented for platform " + getName()); + return make_error("copyD2H not implemented for platform " + + getPlatformName()); } /// Copies data from the host to the device. @@ -81,7 +87,8 @@ virtual Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc, size_t SrcByteOffset, const void *DeviceDstHandle, size_t DstByteOffset, size_t ByteCount) { - return make_error("copyH2D not implemented for platform " + getName()); + return make_error("copyH2D not implemented for platform " + + getPlatformName()); } /// Copies data from one device location to another. @@ -89,39 +96,40 @@ const void *DeviceSrcHandle, size_t SrcByteOffset, const void *DeviceDstHandle, size_t DstByteOffset, size_t ByteCount) { - return make_error("copyD2D not implemented for platform " + getName()); + return make_error("copyD2D not implemented for platform " + + getPlatformName()); } /// Blocks the host until the given stream completes all the work enqueued up /// to the point this function is called. 
virtual Error blockHostUntilDone(const void *PlatformStreamHandle) { return make_error("blockHostUntilDone not implemented for platform " + - getName()); + getPlatformName()); } /// Allocates untyped device memory of a given size in bytes. virtual Expected allocateDeviceMemory(size_t ByteCount) { return make_error("allocateDeviceMemory not implemented for platform " + - getName()); + getPlatformName()); } /// Frees device memory previously allocated by allocateDeviceMemory. virtual Error freeDeviceMemory(const void *Handle) { return make_error("freeDeviceMemory not implemented for platform " + - getName()); + getPlatformName()); } /// Registers previously allocated host memory so it can be used with copyH2D /// and copyD2H. virtual Error registerHostMemory(void *Memory, size_t ByteCount) { return make_error("registerHostMemory not implemented for platform " + - getName()); + getPlatformName()); } /// Unregisters host memory previously registered with registerHostMemory. virtual Error unregisterHostMemory(const void *Memory) { return make_error("unregisterHostMemory not implemented for platform " + - getName()); + getPlatformName()); } /// Copies the given number of bytes from device memory to host memory. 
@@ -133,7 +141,7 @@ size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, size_t ByteCount) { return make_error("synchronousCopyD2H not implemented for platform " + - getName()); + getPlatformName()); } /// Similar to synchronousCopyD2H(const void *, size_t, void @@ -143,7 +151,7 @@ const void *DeviceDstHandle, size_t DstByteOffset, size_t ByteCount) { return make_error("synchronousCopyH2D not implemented for platform " + - getName()); + getPlatformName()); } /// Similar to synchronousCopyD2H(const void *, size_t, void @@ -154,7 +162,7 @@ const void *DeviceDstHandle, size_t DstByteOffset, size_t ByteCount) { return make_error("synchronousCopyD2D not implemented for platform " + - getName()); + getPlatformName()); } }; Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformOptions.h.in =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformOptions.h.in +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformOptions.h.in @@ -0,0 +1,23 @@ +//===-- PlatformOptions.h - Platform option macros --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The contents of this file are filled in at configuration time. This file +/// defines macros that represent the platform configuration state of the build, +/// e.g. which platforms are enabled. 
+/// +//===----------------------------------------------------------------------===// + + +#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H +#define STREAMEXECUTOR_PLATFORMOPTIONS_H + +#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM + +#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h @@ -0,0 +1,42 @@ +//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the CUDAPlatform class. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H +#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H + +#include "streamexecutor/Platform.h" +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" + +#include "llvm/Support/Mutex.h" + +#include + +namespace streamexecutor { +namespace cuda { + +class CUDAPlatform : public Platform { +public: + size_t getDeviceCount() const override; + + Expected getDevice(size_t DeviceIndex) override; + +private: + llvm::sys::Mutex Mutex; + std::map PlatformDevices; +}; + +} // namespace cuda +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h @@ -0,0 +1,93 @@ +//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the CUDAPlatformDevice class. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H +#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H + +#include "streamexecutor/PlatformDevice.h" + +namespace streamexecutor { +namespace cuda { + +Error CUresultToError(int CUResult, const llvm::Twine &Message); + +class CUDAPlatformDevice : public PlatformDevice { +public: + static Expected create(size_t DeviceIndex); + + CUDAPlatformDevice(const CUDAPlatformDevice &) = delete; + CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete; + + CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept; + CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept; + + ~CUDAPlatformDevice() override; + + std::string getName() const override; + + std::string getPlatformName() const override { return "CUDA"; } + + Expected + createKernel(const MultiKernelLoaderSpec &Spec) override; + Error destroyKernel(const void *Handle) override; + + Expected createStream() override; + Error destroyStream(const void *Handle) override; + + Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) override; + + Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, size_t DstByteOffset, + size_t ByteCount) override; + + Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override; + + Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle, + size_t SrcByteOffset, const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) override; + + Error blockHostUntilDone(const void *PlatformStreamHandle) override; + + Expected allocateDeviceMemory(size_t ByteCount) override; + Error freeDeviceMemory(const void 
*Handle) override; + + Error registerHostMemory(void *Memory, size_t ByteCount) override; + Error unregisterHostMemory(const void *Memory) override; + + Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset, + void *HostDst, size_t DstByteOffset, + size_t ByteCount) override; + + Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, size_t DstByteOffset, + size_t ByteCount) override; + + Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset, + const void *DeviceSrcHandle, size_t SrcByteOffset, + size_t ByteCount) override; + +private: + CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {} + + int DeviceIndex; +}; + +} // namespace cuda +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H Index: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h =================================================================== --- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h +++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h @@ -29,6 +29,8 @@ public: std::string getName() const override { return "host"; } + std::string getPlatformName() const override { return "host"; } + Expected createKernel(const MultiKernelLoaderSpec &Spec) override { if (!Spec.hasHostFunction()) { Index: parallel-libs/trunk/streamexecutor/lib/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/lib/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/lib/CMakeLists.txt @@ -3,6 +3,26 @@ set_target_properties(${name} PROPERTIES FOLDER "streamexecutor libraries") endmacro(add_se_library) +if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + set( + CMAKE_MODULE_PATH + ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/platforms/cuda/cmake/modules/") + + find_package(Libcuda 
REQUIRED) + include_directories(${LIBCUDA_INCLUDE_DIRS}) + + set( + STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT + $) + + set( + STREAM_EXECUTOR_LIBCUDA_LIBRARIES + ${LIBCUDA_LIBRARIES}) +endif(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + +add_subdirectory(platforms) + add_se_library( streamexecutor Device.cpp @@ -16,6 +36,8 @@ PlatformDevice.cpp PlatformManager.cpp Stream.cpp - ) + ${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT} + LINK_LIBS + ${STREAM_EXECUTOR_LIBCUDA_LIBRARIES}) install(TARGETS streamexecutor DESTINATION lib) Index: parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp +++ parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp @@ -13,8 +13,14 @@ //===----------------------------------------------------------------------===// #include "streamexecutor/PlatformManager.h" + +#include "streamexecutor/PlatformOptions.h" #include "streamexecutor/platforms/host/HostPlatform.h" +#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM +#include "streamexecutor/platforms/cuda/CUDAPlatform.h" +#endif + namespace streamexecutor { PlatformManager::PlatformManager() { @@ -26,6 +32,10 @@ // themselves when they are loaded. 
PlatformsByName.emplace("host", llvm::make_unique()); + +#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM + PlatformsByName.emplace("cuda", llvm::make_unique()); +#endif } Expected PlatformManager::getPlatformByName(llvm::StringRef Name) { Index: parallel-libs/trunk/streamexecutor/lib/platforms/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/lib/platforms/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/lib/platforms/CMakeLists.txt @@ -0,0 +1,3 @@ +if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) + add_subdirectory(cuda) +endif() Index: parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library( + streamexecutor_cuda_platform + OBJECT + CUDAPlatform.cpp + CUDAPlatformDevice.cpp) Index: parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp +++ parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp @@ -0,0 +1,65 @@ +//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation of CUDA platform internals. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/platforms/cuda/CUDAPlatform.h" +#include "streamexecutor/Device.h" +#include "streamexecutor/Platform.h" +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" + +#include "llvm/Support/Mutex.h" + +#include "cuda.h" + +#include + +namespace streamexecutor { +namespace cuda { + +static CUresult ensureCUDAInitialized() { + static CUresult InitResult = []() { return cuInit(0); }(); + return InitResult; +} + +size_t CUDAPlatform::getDeviceCount() const { + if (ensureCUDAInitialized()) + // TODO(jhen): Log an error. + return 0; + + int DeviceCount = 0; + CUresult Result = cuDeviceGetCount(&DeviceCount); + (void)Result; + // TODO(jhen): Log an error. + + return DeviceCount; +} + +Expected CUDAPlatform::getDevice(size_t DeviceIndex) { + if (CUresult InitResult = ensureCUDAInitialized()) + return CUresultToError(InitResult, "cached cuInit return value"); + + llvm::sys::ScopedLock Lock(Mutex); + auto Iterator = PlatformDevices.find(DeviceIndex); + if (Iterator == PlatformDevices.end()) { + if (auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex)) { + Iterator = + PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first; + } else { + return MaybePDevice.takeError(); + } + } + return Device(&Iterator->second); +} + +} // namespace cuda +} // namespace streamexecutor Index: parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp +++ parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp @@ -0,0 +1,280 @@ +//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation of CUDAPlatformDevice. +/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h" +#include "streamexecutor/PlatformDevice.h" + +#include "cuda.h" + +namespace streamexecutor { +namespace cuda { + +static void *offset(const void *Base, size_t Offset) { + return const_cast(static_cast(Base) + Offset); +} + +Error CUresultToError(int CUResult, const llvm::Twine &Message) { + CUresult Result = static_cast(CUResult); + if (Result) { + const char *ErrorName; + if (cuGetErrorName(Result, &ErrorName)) + ErrorName = "UNKNOWN ERROR NAME"; + const char *ErrorString; + if (cuGetErrorString(Result, &ErrorString)) + ErrorString = "UNKNOWN ERROR DESCRIPTION"; + return make_error("CUDA driver error: '" + Message + "', error code = " + + llvm::Twine(static_cast(Result)) + ", name = " + + ErrorName + ", description = '" + ErrorString + "'"); + } else + return Error::success(); +} + +/// Returns "CUDA device <index>", followed by ": <driver name>" when the +/// driver reports a name for this device. +std::string CUDAPlatformDevice::getName() const { + // Build the name on every call. A function-local static here would be + // initialized once and shared by all CUDAPlatformDevice objects, so every + // device would report the name of whichever device was queried first. + static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024; + std::string Name = "CUDA device " + std::to_string(DeviceIndex); + char NameFromDriver[MAX_DRIVER_NAME_BYTES]; + if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1, + DeviceIndex)) { + // Force termination in case the driver filled the whole buffer. + NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0'; + Name.append(": ").append(NameFromDriver); + } + return Name; +} + +Expected CUDAPlatformDevice::create(size_t DeviceIndex) { + CUdevice DeviceHandle; + if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex)) + return CUresultToError(Result, "cuDeviceGet"); + + CUcontext ContextHandle; + if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle)) + return CUresultToError(Result, "cuDevicePrimaryCtxRetain"); + + if (CUresult 
Result = cuCtxSetCurrent(ContextHandle)) + return CUresultToError(Result, "cuCtxSetCurrent"); + + return CUDAPlatformDevice(DeviceIndex); +} + +CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept + : DeviceIndex(Other.DeviceIndex) { + Other.DeviceIndex = -1; +} + +CUDAPlatformDevice &CUDAPlatformDevice:: +operator=(CUDAPlatformDevice &&Other) noexcept { + if (this != &Other) { + // Release the primary-context reference this object already holds, if + // any, before taking over the one held by Other. + if (DeviceIndex >= 0) + (void)cuDevicePrimaryCtxRelease(DeviceIndex); + DeviceIndex = Other.DeviceIndex; + Other.DeviceIndex = -1; + } + return *this; +} + +CUDAPlatformDevice::~CUDAPlatformDevice() { + // A moved-from object has DeviceIndex == -1 and holds nothing to release. + if (DeviceIndex < 0) + return; + CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex); + (void)Result; + // TODO(jhen): Log error. +} + +/// Loads the PTX from Spec that matches this device's compute capability and +/// returns a handle to the kernel function named by Spec. +Expected +CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) { + // TODO(jhen): Maybe first check loaded modules? + if (!Spec.hasCUDAPTXInMemory()) + return make_error("no CUDA code available to create kernel"); + + CUdevice Device = static_cast(DeviceIndex); + int ComputeCapabilityMajor = 0; + int ComputeCapabilityMinor = 0; + if (CUresult Result = cuDeviceGetAttribute( + &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + Device)) + return CUresultToError( + Result, + "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"); + // The minor version must be queried with the MINOR attribute; querying MAJOR + // again would report e.g. 3.3 for a compute capability 3.5 device. + if (CUresult Result = cuDeviceGetAttribute( + &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + Device)) + return CUresultToError( + Result, + "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"); + const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor, + ComputeCapabilityMinor); + + if (!Code) + return make_error("no suitable CUDA source found for compute capability " + + llvm::Twine(ComputeCapabilityMajor) + "." 
+ + llvm::Twine(ComputeCapabilityMinor)); + + CUmodule Module; + if (CUresult Result = cuModuleLoadData(&Module, Code)) + return CUresultToError(Result, "cuModuleLoadData"); + + CUfunction Function; + if (CUresult Result = + cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str())) + return CUresultToError(Result, "cuModuleGetFunction"); + + // TODO(jhen): Should I save this function pointer in case someone asks for + // it again? + + // TODO(jhen): Should I save the module pointer so I can unload it when I + // destroy this device? + + return static_cast(Function); +} + +Error CUDAPlatformDevice::destroyKernel(const void *Handle) { + // TODO(jhen): Maybe keep track of kernels for each module and unload the + // module after they are all destroyed. + return Error::success(); +} + +Expected CUDAPlatformDevice::createStream() { + CUstream Stream; + if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT)) + return CUresultToError(Result, "cuStreamCreate"); + return Stream; +} + +Error CUDAPlatformDevice::destroyStream(const void *Handle) { + return CUresultToError( + cuStreamDestroy(static_cast(const_cast(Handle))), + "cuStreamDestroy"); +} + +Error CUDAPlatformDevice::launch( + const void *PlatformStreamHandle, BlockDimensions BlockSize, + GridDimensions GridSize, const void *PKernelHandle, + const PackedKernelArgumentArrayBase &ArgumentArray) { + CUfunction Function = + reinterpret_cast(const_cast(PKernelHandle)); + CUstream Stream = + reinterpret_cast(const_cast(PlatformStreamHandle)); + // TODO(jhen): Deal with shared memory arguments. 
+ unsigned SharedMemoryBytes = 0; + void **ArgumentAddresses = const_cast(ArgumentArray.getAddresses()); + return CUresultToError(cuLaunchKernel(Function, GridSize.X, GridSize.Y, + GridSize.Z, BlockSize.X, BlockSize.Y, + BlockSize.Z, SharedMemoryBytes, Stream, + ArgumentAddresses, nullptr), + "cuLaunchKernel"); +} + +Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle, + const void *DeviceSrcHandle, + size_t SrcByteOffset, void *HostDst, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError( + cuMemcpyDtoHAsync( + offset(HostDst, DstByteOffset), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount, + static_cast(const_cast(PlatformStreamHandle))), + "cuMemcpyDtoHAsync"); +} + +Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle, + const void *HostSrc, size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError( + cuMemcpyHtoDAsync( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + offset(HostSrc, SrcByteOffset), ByteCount, + static_cast(const_cast(PlatformStreamHandle))), + "cuMemcpyHtoDAsync"); +} + +Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle, + const void *DeviceSrcHandle, + size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, size_t ByteCount) { + return CUresultToError( + cuMemcpyDtoDAsync( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount, + static_cast(const_cast(PlatformStreamHandle))), + "cuMemcpyDtoDAsync"); +} + +Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) { + return CUresultToError(cuStreamSynchronize(static_cast( + const_cast(PlatformStreamHandle))), + "cuStreamSynchronize"); +} + +Expected CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) { + CUdeviceptr Pointer; + if (CUresult Result = cuMemAlloc(&Pointer, ByteCount)) + return 
CUresultToError(Result, "cuMemAlloc"); + return reinterpret_cast(Pointer); +} + +Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) { + return CUresultToError(cuMemFree(reinterpret_cast(Handle)), + "cuMemFree"); +} + +Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) { + return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u), + "cuMemHostRegister"); +} + +Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) { + return CUresultToError(cuMemHostUnregister(const_cast(Memory)), + "cuMemHostUnregister"); +} + +Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle, + size_t SrcByteOffset, + void *HostDst, + size_t DstByteOffset, + size_t ByteCount) { + return CUresultToError(cuMemcpyDtoH(offset(HostDst, DstByteOffset), + reinterpret_cast(offset( + DeviceSrcHandle, SrcByteOffset)), + ByteCount), + "cuMemcpyDtoH"); +} + +Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc, + size_t SrcByteOffset, + const void *DeviceDstHandle, + size_t DstByteOffset, + size_t ByteCount) { + return CUresultToError( + cuMemcpyHtoD( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + offset(HostSrc, SrcByteOffset), ByteCount), + "cuMemcpyHtoD"); +} + +Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle, + size_t DstByteOffset, + const void *DeviceSrcHandle, + size_t SrcByteOffset, + size_t ByteCount) { + return CUresultToError( + cuMemcpyDtoD( + reinterpret_cast(offset(DeviceDstHandle, DstByteOffset)), + reinterpret_cast(offset(DeviceSrcHandle, SrcByteOffset)), + ByteCount), + "cuMemcpyDtoD"); +} + +} // namespace cuda +} // namespace streamexecutor Index: parallel-libs/trunk/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake =================================================================== --- parallel-libs/trunk/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake +++ 
parallel-libs/trunk/streamexecutor/lib/platforms/cuda/cmake/modules/FindLibcuda.cmake @@ -0,0 +1,21 @@ +# - Try to find the libcuda library +# Once done this will define +# LIBCUDA_FOUND - System has libcuda +# LIBCUDA_INCLUDE_DIRS - The libcuda include directories +# LIBCUDA_LIBRARIES - The libraries needed to use libcuda + +# TODO(jhen): Allow users to specify a search path. +find_path(LIBCUDA_INCLUDE_DIR cuda.h /usr/local/cuda/include) +# TODO(jhen): Use the library that goes with the headers. +find_library(LIBCUDA_LIBRARY cuda) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set LIBCUDA_FOUND to TRUE if +# all listed variables are TRUE +find_package_handle_standard_args( + LIBCUDA DEFAULT_MSG LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY) + +mark_as_advanced(LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY) + +set(LIBCUDA_LIBRARIES ${LIBCUDA_LIBRARY}) +set(LIBCUDA_INCLUDE_DIRS ${LIBCUDA_INCLUDE_DIR})