Index: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt =================================================================== --- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt +++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt @@ -1,5 +1,5 @@ if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM) - set(CUDA_TEST_SOURCES CUDATest.cpp) + set(CUDA_TEST_SOURCES cuda/CUDATest.cpp) endif() add_se_unittest( Index: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp +++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp @@ -1,215 +0,0 @@ -//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the unit tests for CUDA platform code. -/// -//===----------------------------------------------------------------------===// - -#include "streamexecutor/StreamExecutor.h" - -#include "gtest/gtest.h" - -namespace { - -namespace compilergen { -using SaxpyKernel = - streamexecutor::Kernel, - streamexecutor::GlobalDeviceMemory>; - -const char *SaxpyPTX = R"( - .version 4.3 - .target sm_20 - .address_size 64 - - .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { - .reg .f32 %AValue; - .reg .f32 %XValue; - .reg .f32 %YValue; - .reg .f32 %Result; - - .reg .b64 %XBaseAddrGeneric; - .reg .b64 %YBaseAddrGeneric; - .reg .b64 %XBaseAddrGlobal; - .reg .b64 %YBaseAddrGlobal; - .reg .b64 %XAddr; - .reg .b64 %YAddr; - .reg .b64 %ThreadByteOffset; - - .reg .b32 %TID; - - ld.param.f32 %AValue, [A]; - ld.param.u64 %XBaseAddrGeneric, [X]; - ld.param.u64 %YBaseAddrGeneric, [Y]; - cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; - cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; - mov.u32 %TID, %tid.x; - mul.wide.u32 %ThreadByteOffset, %TID, 4; - add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; - add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; - ld.global.f32 %XValue, [%XAddr]; - ld.global.f32 %YValue, [%YAddr]; - fma.rn.f32 %Result, %AValue, %XValue, %YValue; - st.global.f32 [%XAddr], %Result; - ret; - } -)"; - -static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { - streamexecutor::MultiKernelLoaderSpec Spec; - Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); - return Spec; -}(); - -using SwapPairsKernel = - streamexecutor::Kernel, - streamexecutor::GlobalDeviceMemory, int>; - -const char *SwapPairsPTX = R"( - .version 4.3 - .target sm_20 - .address_size 64 - - .extern .shared .align 4 .b8 SwapSpace[]; - - .visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) { - .reg .b64 %InOutGeneric; - .reg .b32 %InOutSizeValue; - - .reg .b32 %LocalIndex; - .reg .b32 %PartnerIndex; - .reg .b32 %ThreadsPerBlock; - .reg .b32 %BlockIndex; - .reg .b32 %GlobalIndex; - - .reg .b32 %GlobalIndexBound; - .reg .pred %GlobalIndexTooHigh; - - .reg .b64 %InOutGlobal; - .reg .b64 %GlobalByteOffset; - .reg .b64 %GlobalAddress; - - .reg .b32 %InitialValue; - .reg .b32 %SwappedValue; - - .reg .b64 %SharedBaseAddr; - .reg .b64 %LocalWriteByteOffset; - .reg .b64 %LocalReadByteOffset; - .reg .b64 %SharedWriteAddr; - .reg .b64 %SharedReadAddr; - - ld.param.u64 %InOutGeneric, [InOut]; - ld.param.u32 %InOutSizeValue, [InOutSize]; - mov.u32 %LocalIndex, %tid.x; - mov.u32 %ThreadsPerBlock, %ntid.x; - mov.u32 %BlockIndex, %ctaid.x; - mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex; - and.b32 %GlobalIndexBound, %InOutSizeValue, -2; - setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound; - @%GlobalIndexTooHigh bra END; - - cvta.to.global.u64 %InOutGlobal, %InOutGeneric; - mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4; - add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset; - ld.global.u32 %InitialValue, [%GlobalAddress]; - mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4; - mov.u64 %SharedBaseAddr, SwapSpace; - add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset; - st.shared.u32 [%SharedWriteAddr], %InitialValue; - bar.sync 0; - xor.b32 %PartnerIndex, %LocalIndex, 1; - mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4; - add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset; - ld.shared.u32 %SwappedValue, [%SharedReadAddr]; - st.global.u32 [%GlobalAddress], %SwappedValue; - - END: - ret; - } -)"; - -static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = []() { - streamexecutor::MultiKernelLoaderSpec Spec; - Spec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}}); - return Spec; -}(); -} // namespace compilergen - -namespace se = ::streamexecutor; -namespace cg = ::compilergen; - -class CUDATest : public ::testing::Test { -public: - CUDATest() - : Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))), - Device(getOrDie(Platform->getDevice(0))), - Stream(getOrDie(Device.createStream())) {} - - se::Platform *Platform; - se::Device Device; - se::Stream Stream; -}; - -TEST_F(CUDATest, Saxpy) { - float A = 42.0f; - std::vector HostX = {0, 1, 2, 3}; - std::vector HostY = {4, 5, 6, 7}; - size_t ArraySize = HostX.size(); - - cg::SaxpyKernel Kernel = - getOrDie(Device.createKernel(cg::SaxpyLoaderSpec)); - - se::RegisteredHostMemory RegisteredX = - getOrDie(Device.registerHostMemory(HostX)); - se::RegisteredHostMemory RegisteredY = - getOrDie(Device.registerHostMemory(HostY)); - - se::GlobalDeviceMemory X = - getOrDie(Device.allocateDeviceMemory(ArraySize)); - se::GlobalDeviceMemory Y = - getOrDie(Device.allocateDeviceMemory(ArraySize)); - - Stream.thenCopyH2D(RegisteredX, X) - .thenCopyH2D(RegisteredY, Y) - .thenLaunch(ArraySize, 1, Kernel, A, X, Y) - .thenCopyD2H(X, RegisteredX); - se::dieIfError(Stream.blockHostUntilDone()); - - std::vector ExpectedX = {4, 47, 90, 133}; - EXPECT_EQ(ExpectedX, HostX); -} - -TEST_F(CUDATest, DynamicSharedMemory) { - std::vector HostPairs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - std::vector HostResult(HostPairs.size(), 0); - int ArraySize = HostPairs.size(); - - cg::SwapPairsKernel Kernel = getOrDie( - Device.createKernel(cg::SwapPairsLoaderSpec)); - - se::RegisteredHostMemory RegisteredPairs = - getOrDie(Device.registerHostMemory(HostPairs)); - se::RegisteredHostMemory RegisteredResult = - getOrDie(Device.registerHostMemory(HostResult)); - - se::GlobalDeviceMemory Pairs = - getOrDie(Device.allocateDeviceMemory(ArraySize)); - auto SharedMemory = - se::SharedDeviceMemory::makeFromElementCount(ArraySize); - - Stream.thenCopyH2D(RegisteredPairs, Pairs) - .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize) - .thenCopyD2H(Pairs, RegisteredResult); - se::dieIfError(Stream.blockHostUntilDone()); - - std::vector ExpectedPairs = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10}; - EXPECT_EQ(ExpectedPairs, HostResult); -} - -} // namespace Index: parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp =================================================================== --- parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp +++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp @@ -0,0 +1,215 @@ +//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the unit tests for CUDA platform code. +/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/StreamExecutor.h" + +#include "gtest/gtest.h" + +namespace { + +namespace compilergen { +using SaxpyKernel = + streamexecutor::Kernel, + streamexecutor::GlobalDeviceMemory>; + +const char *SaxpyPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { + .reg .f32 %AValue; + .reg .f32 %XValue; + .reg .f32 %YValue; + .reg .f32 %Result; + + .reg .b64 %XBaseAddrGeneric; + .reg .b64 %YBaseAddrGeneric; + .reg .b64 %XBaseAddrGlobal; + .reg .b64 %YBaseAddrGlobal; + .reg .b64 %XAddr; + .reg .b64 %YAddr; + .reg .b64 %ThreadByteOffset; + + .reg .b32 %TID; + + ld.param.f32 %AValue, [A]; + ld.param.u64 %XBaseAddrGeneric, [X]; + ld.param.u64 %YBaseAddrGeneric, [Y]; + cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; + cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; + mov.u32 %TID, %tid.x; + mul.wide.u32 %ThreadByteOffset, %TID, 4; + add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; + add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; + ld.global.f32 %XValue, [%XAddr]; + ld.global.f32 %YValue, [%YAddr]; + fma.rn.f32 %Result, %AValue, %XValue, %YValue; + st.global.f32 [%XAddr], %Result; + ret; + } +)"; + +static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); + return Spec; +}(); + +using SwapPairsKernel = + streamexecutor::Kernel, + streamexecutor::GlobalDeviceMemory, int>; + +const char *SwapPairsPTX = R"( + .version 4.3 + .target sm_20 + .address_size 64 + + .extern .shared .align 4 .b8 SwapSpace[]; + + .visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) { + .reg .b64 %InOutGeneric; + .reg .b32 %InOutSizeValue; + + .reg .b32 %LocalIndex; + .reg .b32 %PartnerIndex; + .reg .b32 %ThreadsPerBlock; + .reg .b32 %BlockIndex; + .reg .b32 %GlobalIndex; + + .reg .b32 %GlobalIndexBound; + .reg .pred %GlobalIndexTooHigh; + + .reg .b64 %InOutGlobal; + .reg .b64 %GlobalByteOffset; + .reg .b64 %GlobalAddress; + + .reg .b32 %InitialValue; + .reg .b32 %SwappedValue; + + .reg .b64 %SharedBaseAddr; + .reg .b64 %LocalWriteByteOffset; + .reg .b64 %LocalReadByteOffset; + .reg .b64 %SharedWriteAddr; + .reg .b64 %SharedReadAddr; + + ld.param.u64 %InOutGeneric, [InOut]; + ld.param.u32 %InOutSizeValue, [InOutSize]; + mov.u32 %LocalIndex, %tid.x; + mov.u32 %ThreadsPerBlock, %ntid.x; + mov.u32 %BlockIndex, %ctaid.x; + mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex; + and.b32 %GlobalIndexBound, %InOutSizeValue, -2; + setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound; + @%GlobalIndexTooHigh bra END; + + cvta.to.global.u64 %InOutGlobal, %InOutGeneric; + mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4; + add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset; + ld.global.u32 %InitialValue, [%GlobalAddress]; + mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4; + mov.u64 %SharedBaseAddr, SwapSpace; + add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset; + st.shared.u32 [%SharedWriteAddr], %InitialValue; + bar.sync 0; + xor.b32 %PartnerIndex, %LocalIndex, 1; + mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4; + add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset; + ld.shared.u32 %SwappedValue, [%SharedReadAddr]; + st.global.u32 [%GlobalAddress], %SwappedValue; + + END: + ret; + } +)"; + +static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = []() { + streamexecutor::MultiKernelLoaderSpec Spec; + Spec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}}); + return Spec; +}(); +} // namespace compilergen + +namespace se = ::streamexecutor; +namespace cg = ::compilergen; + +class CUDATest : public ::testing::Test { +public: + CUDATest() + : Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))), + Device(getOrDie(Platform->getDevice(0))), + Stream(getOrDie(Device.createStream())) {} + + se::Platform *Platform; + se::Device Device; + se::Stream Stream; +}; + +TEST_F(CUDATest, Saxpy) { + float A = 42.0f; + std::vector HostX = {0, 1, 2, 3}; + std::vector HostY = {4, 5, 6, 7}; + size_t ArraySize = HostX.size(); + + cg::SaxpyKernel Kernel = + getOrDie(Device.createKernel(cg::SaxpyLoaderSpec)); + + se::RegisteredHostMemory RegisteredX = + getOrDie(Device.registerHostMemory(HostX)); + se::RegisteredHostMemory RegisteredY = + getOrDie(Device.registerHostMemory(HostY)); + + se::GlobalDeviceMemory X = + getOrDie(Device.allocateDeviceMemory(ArraySize)); + se::GlobalDeviceMemory Y = + getOrDie(Device.allocateDeviceMemory(ArraySize)); + + Stream.thenCopyH2D(RegisteredX, X) + .thenCopyH2D(RegisteredY, Y) + .thenLaunch(ArraySize, 1, Kernel, A, X, Y) + .thenCopyD2H(X, RegisteredX); + se::dieIfError(Stream.blockHostUntilDone()); + + std::vector ExpectedX = {4, 47, 90, 133}; + EXPECT_EQ(ExpectedX, HostX); +} + +TEST_F(CUDATest, DynamicSharedMemory) { + std::vector HostPairs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + std::vector HostResult(HostPairs.size(), 0); + int ArraySize = HostPairs.size(); + + cg::SwapPairsKernel Kernel = getOrDie( + Device.createKernel(cg::SwapPairsLoaderSpec)); + + se::RegisteredHostMemory RegisteredPairs = + getOrDie(Device.registerHostMemory(HostPairs)); + se::RegisteredHostMemory RegisteredResult = + getOrDie(Device.registerHostMemory(HostResult)); + + se::GlobalDeviceMemory Pairs = + getOrDie(Device.allocateDeviceMemory(ArraySize)); + auto SharedMemory = + se::SharedDeviceMemory::makeFromElementCount(ArraySize); + + Stream.thenCopyH2D(RegisteredPairs, Pairs) + .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize) + .thenCopyD2H(Pairs, RegisteredResult); + se::dieIfError(Stream.blockHostUntilDone()); + + std::vector ExpectedPairs = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10}; + EXPECT_EQ(ExpectedPairs, HostResult); +} + +} // namespace