Index: streamexecutor/include/streamexecutor/DeviceMemory.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/DeviceMemory.h @@ -0,0 +1,190 @@ +//===-- DeviceMemory.h - Types representing device memory -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines types that represent device memory allocations. +/// +/// DeviceMemoryBase is similar to a pair consisting of a void* pointer and a +/// byte count to tell how much memory is pointed to by that void*. +/// +/// DeviceMemory is a subclass of DeviceMemoryBase which keeps track of the +/// type of element to be stored in the device array. It is similar to a pair of +/// a T* pointer and an element count to tell how many elements of type T fit in +/// the memory pointed to by that T*. +/// +/// SharedDeviceMemory is a subclass of DeviceMemoryBase which knows how many +/// elements of type T it can hold, but does not have an opaque handle to the +/// device buffer. Since shared memory buffers are created during kernel launch +/// and destroyed when the kernel completes, host code can never have a handle +/// to shared memory. +/// +/// These wrapper classes serve to keep the size of a device allocation together +/// with the device handle to the memory from that allocation. The +/// DeviceMemory and SharedDeviceMemory classes also help make static type +/// checking possible for device allocations. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_DEVICEMEMORY_H +#define STREAMEXECUTOR_DEVICEMEMORY_H + +#include + +namespace streamexecutor { + +/// Wrapper around a generic device memory allocation. 
+/// +/// This class represents a buffer of untyped bytes in the memory of a device. +/// See DeviceMemory for the corresponding type that includes type +/// information for the elements in its buffer. +/// +/// This is effectively a pair consisting of an opaque handle and a buffer size +/// in bytes. The opaque handle is a platform-independent handle to the actual +/// memory that is allocated on the device. +/// +/// In some cases, such as in the CUDA platform, the opaque handle may actually +/// be a pointer in the virtual address space and it may be valid to perform +/// arithmetic on it to obtain other device pointers, but this is not the case +/// in general. +/// +/// For example, in the OpenCL platform, the handle is a pointer to a _cl_mem +/// handle object which really is completely opaque to the user. +/// +/// The only fully platform-generic operations on handles are using them to +/// create new DeviceMemoryBase objects, and comparing them to each other for +/// equality. +class DeviceMemoryBase { +public: + /// Creates a DeviceMemoryBase from an optional handle and an optional byte + /// count. + explicit DeviceMemoryBase(const void *Handle = nullptr, size_t ByteCount = 0) + : Handle(Handle), ByteCount(ByteCount) {} + + /// Copyable like a pointer. + DeviceMemoryBase(const DeviceMemoryBase &) = default; + + /// Assignment copyable like a pointer. + DeviceMemoryBase &operator=(const DeviceMemoryBase &) = default; + + /// Returns the size, in bytes, for the backing memory. + size_t getByteCount() const { return ByteCount; } + + /// Gets the internal handle. + /// + /// Warning: note that the pointer returned is not necessarily directly to + /// device virtual address space, but is platform-dependent. + const void *getHandle() const { return Handle; } + +private: + const void *Handle; // Platform-dependent value representing allocated memory. + size_t ByteCount; // Size in bytes of this allocation. 
+}; + +/// Typed wrapper around the "void *"-like DeviceMemoryBase class. +/// +/// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase +/// that represents a buffer of integers stored in device memory. +template class DeviceMemory : public DeviceMemoryBase { +public: + /// Creates a typed area of DeviceMemory with a given opaque handle and the + /// given element count. + static DeviceMemory makeFromElementCount(void *Handle, + size_t ElementCount) { + return DeviceMemory(Handle, ElementCount); + } + + /// Constructs a zero-sized memory region with a nullptr handle. + DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} + + /// Creates a typed device memory region from an untyped device memory region. + /// + /// This effectively amounts to a cast from a void* to an ElemT*, but it also + /// manages the difference in the size measurements when DeviceMemoryBase is + /// measured in bytes and DeviceMemory is measured in elements. + explicit DeviceMemory(const DeviceMemoryBase &Other) + : DeviceMemoryBase(Other.getHandle(), Other.getByteCount()) {} + + /// Copyable like a pointer. + DeviceMemory(const DeviceMemory &) = default; + + /// Assignment copyable like a pointer. + DeviceMemory &operator=(const DeviceMemory &) = default; + + /// Returns the number of elements of type ElemT that constitute this + /// allocation. + size_t getElementCount() const { return getByteCount() / sizeof(ElemT); } + + /// Returns whether this is a single-element allocation. + bool isScalar() const { return getElementCount() == 1; } + +protected: + /// Constructs a DeviceMemory instance from an opaque handle and an element + /// count. + /// + /// This constructor is not public because there is a potential for confusion + /// between the size of the buffer in bytes and the size of the buffer in + /// elements. + /// + /// The static method makeFromElementCount is provided for users of this class + /// because its name makes the meaning of the size parameter clear.
+ DeviceMemory(void *Handle, size_t ElementCount) + : DeviceMemoryBase(Handle, ElementCount * sizeof(ElemT)) {} +}; + +/// A class to represent the type and size of a dynamic shared memory buffer on +/// a device. +/// +/// Shared memory buffers exist only on the device and cannot be manipulated +/// from the host, so instances of this class do not have an opaque handle, only +/// a size. +/// +/// This type of memory is called "local" memory in OpenCL and "shared" memory +/// in CUDA, and both platforms follow the rule that the host code only knows +/// the size of these buffers and does not have a handle to them. +/// +/// The treatment of shared memory in StreamExecutor matches the way it is done +/// in OpenCL where a kernel takes any number of shared memory sizes as kernel +/// function arguments. +/// +/// In CUDA only one shared memory size argument is allowed per kernel call. +/// StreamExecutor handles this by allowing CUDA kernel signatures that take +/// multiple SharedDeviceMemory arguments, and by simply adding together all the +/// shared memory sizes to get the final shared memory size that is used to +/// launch the kernel. +template +class SharedDeviceMemory : public DeviceMemory { +public: + /// Creates a typed area of SharedDeviceMemory with a given number of + /// elements. + static SharedDeviceMemory makeFromElementCount(size_t ElementCount) { + return SharedDeviceMemory(ElementCount); + } + + /// Copyable because it is just an array size. + SharedDeviceMemory(const SharedDeviceMemory &) = default; + + /// Assignment copyable because it is just an array size. + SharedDeviceMemory &operator=(const SharedDeviceMemory &) = default; + +private: + /// Constructs a SharedDeviceMemory instance from an element count. + /// + /// This constructor is not public because there is a potential for confusion + /// between the size of the buffer in bytes and the size of the buffer in + /// elements. 
+ /// + /// The static method makeFromElementCount is provided for users of this class + /// because its name makes the meaning of the size parameter clear. + explicit SharedDeviceMemory(size_t ElementCount) + : DeviceMemory(nullptr, ElementCount) {} +}; + +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_DEVICEMEMORY_H Index: streamexecutor/include/streamexecutor/PackedKernelArgumentArray.h =================================================================== --- /dev/null +++ streamexecutor/include/streamexecutor/PackedKernelArgumentArray.h @@ -0,0 +1,202 @@ +//===-- PackedKernelArgumentArray.h - Packed kernel arg types ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The types in this file are designed to deal with the fact that DeviceMemory +/// kernel arguments are treated differently from other arguments during kernel +/// argument packing. A non-DeviceMemory argument is passed to a kernel by +/// specifying its address and maybe its size in bytes, depending on the +/// platform, but DeviceMemory arguments require different steps. +/// +/// DeviceMemory arguments are passed to a kernel by passing their opaque +/// handle. SharedDeviceMemory arguments have no associated address, only a +/// size, so the size is the only information that gets passed to the kernel +/// launch. +/// +/// The KernelArgumentType enum is used to keep track of the type of each +/// argument. +/// +/// The PackedKernelArgumentArray class uses template metaprogramming to convert +/// each argument to a PackedKernelArgument with minimal runtime overhead.
+/// +/// The design of the PackedKernelArgumentArray class has a few idiosyncrasies +/// due to the fact that parameter packing has been identified as +/// performance-critical in some applications. The packed argument data is +/// stored as a struct of arrays rather than an array of structs because CUDA +/// kernel launches take an array of argument addresses. Having created the +/// array of argument addresses here, no further work will need to be done in +/// the CUDA layer to unpack and repack the addresses. +/// +/// The shared memory argument count is maintained because in the common case +/// where it is zero, the CUDA layer doesn't have to loop through the argument +/// array and sum up all the shared memory sizes. This is another performance +/// optimization that shows up as a quirk in this class interface. +/// +/// The platform-interface kernel launch function will take the following +/// arguments, which are provided by this interface: +/// * argument count, +/// * array of argument addresses, +/// * array of argument sizes, +/// * array of argument types, and +/// * shared memory argument count. +/// This information should be enough to allow any platform to launch the kernel +/// efficiently, although it is probably more information than is needed for any +/// specific platform. +/// +//===----------------------------------------------------------------------===// + +#ifndef STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H +#define STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H + +#include + +#include "streamexecutor/DeviceMemory.h" + +namespace streamexecutor { + +enum class KernelArgumentType { + VALUE, /// Non-DeviceMemory argument. + GLOBAL_DEVICE_MEMORY, /// Non-shared DeviceMemory argument. + SHARED_DEVICE_MEMORY /// SharedDeviceMemory argument. +}; + +/// An array of packed kernel arguments. +template class PackedKernelArgumentArray { +public: + /// Constructs an instance by packing the specified arguments. + PackedKernelArgumentArray(const ParameterTs &...
Arguments) + : SharedCount(0u) { + PackArguments(0, Arguments...); + } + + /// Gets the number of packed arguments. + size_t getArgumentCount() const { return sizeof...(ParameterTs); } + + /// Gets the address of the argument at the given index. + const void *getAddress(size_t Index) const { return Addresses[Index]; } + + /// Gets the size of the argument at the given index. + size_t getSize(size_t Index) const { return Sizes[Index]; } + + /// Gets the type of the argument at the given index. + KernelArgumentType getType(size_t Index) const { return Types[Index]; } + + /// Gets a pointer to the address array. + const void *const *getAddresses() const { return Addresses.data(); } + + /// Gets a pointer to the sizes array. + size_t *getSizes() const { return Sizes.data(); } + + /// Gets a pointer to the types array. + KernelArgumentType *getTypes() const { return Types.data(); } + + /// Gets the number of SharedDeviceMemory arguments. + size_t getSharedCount() const { return SharedCount; } + +private: + // Base case for PackArguments when there are no arguments to pack. + void PackArguments(size_t) {} + + // Induction step for PackArguments. + template + void PackArguments(size_t Index, const T &Argument, + const RemainingParameterTs &... RemainingArguments) { + PackOneArgument(Index, Argument); + PackArguments(Index + 1, RemainingArguments...); + } + + // Pack a normal, non-DeviceMemory argument. + template void PackOneArgument(size_t Index, const T &Argument) { + Addresses[Index] = &Argument; + Sizes[Index] = sizeof(T); + Types[Index] = KernelArgumentType::VALUE; + } + + // Pack a DeviceMemoryBase argument. + void PackOneArgument(size_t Index, const DeviceMemoryBase &Argument) { + Addresses[Index] = Argument.getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a DeviceMemoryBase pointer argument. 
+ void PackOneArgument(size_t Index, DeviceMemoryBase *const &Argument) { + Addresses[Index] = Argument->getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a const DeviceMemoryBase pointer argument. + void PackOneArgument(size_t Index, const DeviceMemoryBase *const &Argument) { + Addresses[Index] = Argument->getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a DeviceMemory argument. + template + void PackOneArgument(size_t Index, const DeviceMemory &Argument) { + Addresses[Index] = Argument.getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a DeviceMemory pointer argument. + template + void PackOneArgument(size_t Index, DeviceMemory *const &Argument) { + Addresses[Index] = Argument->getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a const DeviceMemory pointer argument. + template + void PackOneArgument(size_t Index, const DeviceMemory *const &Argument) { + Addresses[Index] = Argument->getHandle(); + Sizes[Index] = sizeof(void *); + Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY; + } + + // Pack a SharedDeviceMemory argument. + template + void PackOneArgument(size_t Index, const SharedDeviceMemory &Argument) { + ++SharedCount; + Addresses[Index] = nullptr; + Sizes[Index] = Argument.getByteCount(); + Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY; + } + + // Pack a SharedDeviceMemory pointer argument. + template + void PackOneArgument(size_t Index, SharedDeviceMemory *const &Argument) { + ++SharedCount; + Addresses[Index] = nullptr; + Sizes[Index] = Argument->getByteCount(); + Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY; + } + + // Pack a const SharedDeviceMemory pointer argument. 
+ template + void PackOneArgument(size_t Index, + const SharedDeviceMemory *const &Argument) { + ++SharedCount; + Addresses[Index] = nullptr; + Sizes[Index] = Argument->getByteCount(); + Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY; + } + + std::array Addresses; + std::array Sizes; + std::array Types; + size_t SharedCount; +}; + +} // namespace streamexecutor + +#endif // STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H Index: streamexecutor/lib/unittests/CMakeLists.txt =================================================================== --- streamexecutor/lib/unittests/CMakeLists.txt +++ streamexecutor/lib/unittests/CMakeLists.txt @@ -17,3 +17,12 @@ ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) add_test(KernelSpecTest kernel_spec_test) + +add_executable( + packed_kernel_argument_array_test + PackedKernelArgumentArrayTest.cpp) +target_link_libraries( + packed_kernel_argument_array_test + ${GTEST_BOTH_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT}) +add_test(PackedKernelArgumentArrayTest packed_kernel_argument_array_test) Index: streamexecutor/lib/unittests/PackedKernelArgumentArrayTest.cpp =================================================================== --- /dev/null +++ streamexecutor/lib/unittests/PackedKernelArgumentArrayTest.cpp @@ -0,0 +1,170 @@ +//===-- PackedKernelArgumentArrayTest.cpp - tests for kernel arg packing --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Unit tests for kernel argument packing. 
+/// +//===----------------------------------------------------------------------===// + +#include "streamexecutor/DeviceMemory.h" +#include "streamexecutor/PackedKernelArgumentArray.h" + +#include "gtest/gtest.h" + +namespace { + +namespace se = ::streamexecutor; + +using Type = se::KernelArgumentType; + +// Utility template function to call the PackedKernelArgumentArray constructor +// with the template arguments matching the types of the arguments passed to +// this function. +template +se::PackedKernelArgumentArray +pack(const ParameterTs &... Arguments) { + return se::PackedKernelArgumentArray(Arguments...); +} + +// Test fixture class for testing argument packing. +// +// Basically defines a bunch of types to be packed so they don't have to be +// defined separately in each test. +class DeviceMemoryPackingTest : public ::testing::Test { +public: + DeviceMemoryPackingTest() + : Value(42), Handle(&Value), ByteCount(15), ElementCount(5), + DeviceMemoryBase(Handle, ByteCount), + DeviceMemory( + se::DeviceMemory::makeFromElementCount(Handle, ElementCount)), + SharedDeviceMemory( + se::SharedDeviceMemory::makeFromElementCount(ElementCount)) {} + + int Value; + void *Handle; + size_t ByteCount; + size_t ElementCount; + se::DeviceMemoryBase DeviceMemoryBase; + se::DeviceMemory DeviceMemory; + se::SharedDeviceMemory SharedDeviceMemory; +}; + +// Utility method to check the expected address, size, and type for a packed +// argument at the given index of a PackedKernelArgumentArray. 
+template +static void +ExpectEqual(const void *ExpectedAddress, size_t ExpectedSize, Type ExpectedType, + const se::PackedKernelArgumentArray &Observed, + size_t Index) { + EXPECT_EQ(ExpectedAddress, Observed.getAddress(Index)) << "Index: " << Index; + EXPECT_EQ(ExpectedSize, Observed.getSize(Index)) << "Index: " << Index; + EXPECT_EQ(ExpectedType, Observed.getType(Index)) << "Index: " << Index; +} + +TEST_F(DeviceMemoryPackingTest, SingleArgumentValue) { + auto Array = pack(Value); + ExpectEqual(&Value, sizeof(Value), Type::VALUE, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryBase) { + auto Array = pack(DeviceMemoryBase); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryBasePointer) { + auto Array = pack(&DeviceMemoryBase); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleConstDeviceMemoryBasePointer) { + const se::DeviceMemoryBase *ConstPointer = &DeviceMemoryBase; + auto Array = pack(ConstPointer); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleDeviceMemory) { + auto Array = pack(DeviceMemory); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryPointer) { + auto Array = pack(&DeviceMemory); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + 
EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleConstDeviceMemoryPointer) { + const se::DeviceMemory *ArgumentPointer = &DeviceMemory; + auto Array = pack(ArgumentPointer); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(0u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleSharedDeviceMemory) { + auto Array = pack(SharedDeviceMemory); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(1u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleSharedDeviceMemoryPointer) { + auto Array = pack(&SharedDeviceMemory); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(1u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, SingleConstSharedDeviceMemoryPointer) { + const se::SharedDeviceMemory *ArgumentPointer = &SharedDeviceMemory; + auto Array = pack(ArgumentPointer); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 0); + EXPECT_EQ(1u, Array.getArgumentCount()); + EXPECT_EQ(1u, Array.getSharedCount()); +} + +TEST_F(DeviceMemoryPackingTest, PackSeveralArguments) { + const se::DeviceMemoryBase *BasePointer = &DeviceMemoryBase; + const se::DeviceMemory *TypedPointer = &DeviceMemory; + const se::SharedDeviceMemory *SharedPointer = &SharedDeviceMemory; + auto Array = pack(Value, DeviceMemoryBase, &DeviceMemoryBase, BasePointer, + DeviceMemory, &DeviceMemory, TypedPointer, + SharedDeviceMemory, &SharedDeviceMemory, SharedPointer); + ExpectEqual(&Value, sizeof(Value), Type::VALUE, Array, 0); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 1); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 2); + ExpectEqual(Handle, 
sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 3); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 4); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 5); + ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 6); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 7); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 8); + ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(), + Type::SHARED_DEVICE_MEMORY, Array, 9); + EXPECT_EQ(10u, Array.getArgumentCount()); + EXPECT_EQ(3u, Array.getSharedCount()); +} + +} // namespace