Index: parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
+++ parallel-libs/trunk/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
@@ -162,14 +162,41 @@
       reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
   CUstream Stream =
       reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
-  // TODO(jhen): Deal with shared memory arguments.
-  unsigned SharedMemoryBytes = 0;
+
+  auto Launch = [Function, Stream, BlockSize,
+                 GridSize](size_t SharedMemoryBytes, void **ArgumentAddresses) {
+    return CUresultToError(
+        cuLaunchKernel(Function,                              //
+                       GridSize.X, GridSize.Y, GridSize.Z,    //
+                       BlockSize.X, BlockSize.Y, BlockSize.Z, //
+                       SharedMemoryBytes, Stream, ArgumentAddresses, nullptr),
+        "cuLaunchKernel");
+  };
+
   void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
-  return CUresultToError(cuLaunchKernel(Function, GridSize.X, GridSize.Y,
-                                        GridSize.Z, BlockSize.X, BlockSize.Y,
-                                        BlockSize.Z, SharedMemoryBytes, Stream,
-                                        ArgumentAddresses, nullptr),
-                         "cuLaunchKernel");
+  size_t SharedArgumentCount = ArgumentArray.getSharedCount();
+  if (SharedArgumentCount) {
+    // The argument handling in this case is not very efficient. We may need to
+    // come back and optimize it later.
+    //
+    // Perhaps introduce another branch for the case where there is exactly one
+    // shared memory argument and it is the first one. This is the only case
+    // that will be used for compiler-generated CUDA kernels, and OpenCL users
+    // can choose to take advantage of it by combining their dynamic shared
+    // memory arguments and putting them first in the kernel signature.
+    unsigned SharedMemoryBytes = 0;
+    size_t ArgumentCount = ArgumentArray.getArgumentCount();
+    llvm::SmallVector<void *, 16> NonSharedArgumentAddresses(
+        ArgumentCount - SharedArgumentCount);
+    size_t NonSharedIndex = 0;
+    for (size_t I = 0; I < ArgumentCount; ++I)
+      if (ArgumentArray.getType(I) == KernelArgumentType::SHARED_DEVICE_MEMORY)
+        SharedMemoryBytes += ArgumentArray.getSize(I);
+      else
+        NonSharedArgumentAddresses[NonSharedIndex++] = ArgumentAddresses[I];
+    return Launch(SharedMemoryBytes, NonSharedArgumentAddresses.data());
+  }
+  return Launch(0, ArgumentAddresses);
 }
 
 Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
Index: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
===================================================================
--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
@@ -1,7 +1,12 @@
+if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+  set(CUDA_TEST_SOURCES CUDATest.cpp)
+endif()
+
 add_se_unittest(
   CoreTests
   DeviceTest.cpp
   KernelSpecTest.cpp
   PackedKernelArgumentArrayTest.cpp
   StreamTest.cpp
+  ${CUDA_TEST_SOURCES}
   )
Index: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
===================================================================
--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
@@ -0,0 +1,215 @@
+//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the unit tests for CUDA platform code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/StreamExecutor.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+namespace compilergen {
+using SaxpyKernel =
+    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+                           streamexecutor::GlobalDeviceMemory<float>>;
+
+const char *SaxpyPTX = R"(
+  .version 4.3
+  .target sm_20
+  .address_size 64
+
+  .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
+    .reg .f32 %AValue;
+    .reg .f32 %XValue;
+    .reg .f32 %YValue;
+    .reg .f32 %Result;
+
+    .reg .b64 %XBaseAddrGeneric;
+    .reg .b64 %YBaseAddrGeneric;
+    .reg .b64 %XBaseAddrGlobal;
+    .reg .b64 %YBaseAddrGlobal;
+    .reg .b64 %XAddr;
+    .reg .b64 %YAddr;
+    .reg .b64 %ThreadByteOffset;
+
+    .reg .b32 %TID;
+
+    ld.param.f32 %AValue, [A];
+    ld.param.u64 %XBaseAddrGeneric, [X];
+    ld.param.u64 %YBaseAddrGeneric, [Y];
+    cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
+    cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
+    mov.u32 %TID, %tid.x;
+    mul.wide.u32 %ThreadByteOffset, %TID, 4;
+    add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
+    add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
+    ld.global.f32 %XValue, [%XAddr];
+    ld.global.f32 %YValue, [%YAddr];
+    fma.rn.f32 %Result, %AValue, %XValue, %YValue;
+    st.global.f32 [%XAddr], %Result;
+    ret;
+  }
+)";
+
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
+  streamexecutor::MultiKernelLoaderSpec Spec;
+  Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
+  return Spec;
+}();
+
+using SwapPairsKernel =
+    streamexecutor::Kernel<streamexecutor::SharedDeviceMemory<int>,
+                           streamexecutor::GlobalDeviceMemory<int>, int>;
+
+const char *SwapPairsPTX = R"(
+  .version 4.3
+  .target sm_20
+  .address_size 64
+
+  .extern .shared .align 4 .b8 SwapSpace[];
+
+  .visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) {
+    .reg .b64 %InOutGeneric;
+    .reg .b32 %InOutSizeValue;
+
+    .reg .b32 %LocalIndex;
+    .reg .b32 %PartnerIndex;
+    .reg .b32 %ThreadsPerBlock;
+    .reg .b32 %BlockIndex;
+    .reg .b32 %GlobalIndex;
+
+    .reg .b32 %GlobalIndexBound;
+    .reg .pred %GlobalIndexTooHigh;
+
+    .reg .b64 %InOutGlobal;
+    .reg .b64 %GlobalByteOffset;
+    .reg .b64 %GlobalAddress;
+
+    .reg .b32 %InitialValue;
+    .reg .b32 %SwappedValue;
+
+    .reg .b64 %SharedBaseAddr;
+    .reg .b64 %LocalWriteByteOffset;
+    .reg .b64 %LocalReadByteOffset;
+    .reg .b64 %SharedWriteAddr;
+    .reg .b64 %SharedReadAddr;
+
+    ld.param.u64 %InOutGeneric, [InOut];
+    ld.param.u32 %InOutSizeValue, [InOutSize];
+    mov.u32 %LocalIndex, %tid.x;
+    mov.u32 %ThreadsPerBlock, %ntid.x;
+    mov.u32 %BlockIndex, %ctaid.x;
+    mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex;
+    and.b32 %GlobalIndexBound, %InOutSizeValue, -2;
+    setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound;
+    @%GlobalIndexTooHigh bra END;
+
+    cvta.to.global.u64 %InOutGlobal, %InOutGeneric;
+    mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4;
+    add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset;
+    ld.global.u32 %InitialValue, [%GlobalAddress];
+    mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4;
+    mov.u64 %SharedBaseAddr, SwapSpace;
+    add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset;
+    st.shared.u32 [%SharedWriteAddr], %InitialValue;
+    bar.sync 0;
+    xor.b32 %PartnerIndex, %LocalIndex, 1;
+    mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4;
+    add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset;
+    ld.shared.u32 %SwappedValue, [%SharedReadAddr];
+    st.global.u32 [%GlobalAddress], %SwappedValue;
+
+  END:
+    ret;
+  }
+)";
+
+static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = []() {
+  streamexecutor::MultiKernelLoaderSpec Spec;
+  Spec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}});
+  return Spec;
+}();
+} // namespace compilergen
+
+namespace se = ::streamexecutor;
+namespace cg = ::compilergen;
+
+class CUDATest : public ::testing::Test {
+public:
+  CUDATest()
+      : Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))),
+        Device(getOrDie(Platform->getDevice(0))),
+        Stream(getOrDie(Device.createStream())) {}
+
+  se::Platform *Platform;
+  se::Device Device;
+  se::Stream Stream;
+};
+
+TEST_F(CUDATest, Saxpy) {
+  float A = 42.0f;
+  std::vector<float> HostX = {0, 1, 2, 3};
+  std::vector<float> HostY = {4, 5, 6, 7};
+  size_t ArraySize = HostX.size();
+
+  cg::SaxpyKernel Kernel =
+      getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+  se::RegisteredHostMemory<float> RegisteredX =
+      getOrDie(Device.registerHostMemory(HostX));
+  se::RegisteredHostMemory<float> RegisteredY =
+      getOrDie(Device.registerHostMemory(HostY));
+
+  se::GlobalDeviceMemory<float> X =
+      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
+  se::GlobalDeviceMemory<float> Y =
+      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
+
+  Stream.thenCopyH2D(RegisteredX, X)
+      .thenCopyH2D(RegisteredY, Y)
+      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
+      .thenCopyD2H(X, RegisteredX);
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  std::vector<float> ExpectedX = {4, 47, 90, 133};
+  EXPECT_EQ(ExpectedX, HostX);
+}
+
+TEST_F(CUDATest, DynamicSharedMemory) {
+  std::vector<int> HostPairs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  std::vector<int> HostResult(HostPairs.size(), 0);
+  int ArraySize = HostPairs.size();
+
+  cg::SwapPairsKernel Kernel = getOrDie(
+      Device.createKernel<cg::SwapPairsKernel>(cg::SwapPairsLoaderSpec));
+
+  se::RegisteredHostMemory<int> RegisteredPairs =
+      getOrDie(Device.registerHostMemory(HostPairs));
+  se::RegisteredHostMemory<int> RegisteredResult =
+      getOrDie(Device.registerHostMemory(HostResult));
+
+  se::GlobalDeviceMemory<int> Pairs =
+      getOrDie(Device.allocateDeviceMemory<int>(ArraySize));
+  auto SharedMemory =
+      se::SharedDeviceMemory<int>::makeFromElementCount(ArraySize);
+
+  Stream.thenCopyH2D(RegisteredPairs, Pairs)
+      .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize)
+      .thenCopyD2H(Pairs, RegisteredResult);
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  std::vector<int> ExpectedPairs = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10};
+  EXPECT_EQ(ExpectedPairs, HostResult);
+}
+
+} // namespace
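
Note: the following is a minimal, self-contained sketch of the argument-filtering step that the CUDAPlatformDevice.cpp change above introduces. It is not part of the patch; the ArgumentInfo struct and splitSharedArguments helper are hypothetical stand-ins for the real PackedKernelArgumentArray interface. The point it illustrates is that CUDA takes the dynamic shared memory size as a launch parameter to cuLaunchKernel rather than as a kernel argument, so shared-memory arguments contribute only their byte sizes while the remaining argument addresses are compacted into a new array.

// Hypothetical, simplified illustration of the filtering done in
// CUDAPlatformDevice::launch above; not the StreamExecutor API.
#include <cstddef>
#include <utility>
#include <vector>

enum class ArgKind { Ordinary, SharedDeviceMemory };

struct ArgumentInfo {
  ArgKind Kind;
  void *Address; // Ignored for shared-memory arguments.
  size_t Size;   // Byte size; only used for shared-memory arguments here.
};

// Returns the dynamic shared memory byte count for cuLaunchKernel and the
// compacted list of non-shared argument addresses.
std::pair<unsigned, std::vector<void *>>
splitSharedArguments(const std::vector<ArgumentInfo> &Arguments) {
  unsigned SharedMemoryBytes = 0;
  std::vector<void *> NonSharedAddresses;
  NonSharedAddresses.reserve(Arguments.size());
  for (const ArgumentInfo &Argument : Arguments) {
    if (Argument.Kind == ArgKind::SharedDeviceMemory)
      SharedMemoryBytes += Argument.Size; // Only the size is forwarded.
    else
      NonSharedAddresses.push_back(Argument.Address);
  }
  return {SharedMemoryBytes, NonSharedAddresses};
}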