Index: streamexecutor/include/streamexecutor/DeviceMemory.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/DeviceMemory.h
@@ -0,0 +1,190 @@
+//===-- DeviceMemory.h - Types representing device memory -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines types that represent device memory allocations.
+///
+/// DeviceMemoryBase is similar to a pair consisting of a void* pointer and a
+/// byte count to tell how much memory is pointed to by that void*.
+///
+/// DeviceMemory<T> is a subclass of DeviceMemoryBase which keeps track of the
+/// type of element to be stored in the device array. It is similar to a pair of
+/// a T* pointer and an element count to tell how many elements of type T fit in
+/// the memory pointed to by that T*.
+///
+/// SharedDeviceMemory<T> is a subclass of DeviceMemory<T> which knows how many
+/// elements of type T it can hold, but does not have an opaque handle to the
+/// device buffer. Since shared memory buffers are created during kernel launch
+/// and destroyed when the kernel completes, host code can never have a handle
+/// to shared memory.
+///
+/// These wrapper classes serve to keep the size of a device allocation together
+/// with the device handle to the memory from that allocation. The
+/// DeviceMemory<T> and SharedDeviceMemory<T> classes also help make static type
+/// checking possible for device allocations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_DEVICEMEMORY_H
+#define STREAMEXECUTOR_DEVICEMEMORY_H
+
+#include <cstddef>
+
+namespace streamexecutor {
+
+/// Wrapper around a generic device memory allocation.
+///
+/// This class represents a buffer of untyped bytes in the memory of a device.
+/// See DeviceMemory<T> for the corresponding type that includes type
+/// information for the elements in its buffer.
+///
+/// This is effectively a pair consisting of an opaque handle and a buffer size
+/// in bytes. The opaque handle is a platform-independent handle to the actual
+/// memory that is allocated on the device.
+///
+/// In some cases, such as in the CUDA platform, the opaque handle may actually
+/// be a pointer in the virtual address space and it may be valid to perform
+/// arithmetic on it to obtain other device pointers, but this is not the case
+/// in general.
+///
+/// For example, in the OpenCL platform, the handle is a pointer to a _cl_mem
+/// handle object which really is completely opaque to the user.
+///
+/// The only fully platform-generic operations on handles are using them to
+/// create new DeviceMemoryBase objects, and comparing them to each other for
+/// equality.
+class DeviceMemoryBase {
+public:
+  /// Creates a DeviceMemoryBase from an opaque handle and a byte count. Both
+  /// are optional and default to an empty region (nullptr handle, zero bytes).
+  explicit DeviceMemoryBase(const void *Handle = nullptr, size_t ByteCount = 0)
+      : Handle(Handle), ByteCount(ByteCount) {}
+
+  /// Copyable like a pointer: the copy refers to the same handle.
+  DeviceMemoryBase(const DeviceMemoryBase &) = default;
+
+  /// Assignable like a pointer: the target then refers to the same handle.
+  DeviceMemoryBase &operator=(const DeviceMemoryBase &) = default;
+
+  /// Returns the size, in bytes, of the backing memory.
+  size_t getByteCount() const { return ByteCount; }
+
+  /// Gets the opaque handle.
+  ///
+  /// Warning: the returned pointer is platform-dependent; it is not
+  /// necessarily an address in the device's virtual address space.
+  const void *getHandle() const { return Handle; }
+
+private:
+  const void *Handle; // Platform-dependent value representing allocated memory.
+  size_t ByteCount;   // Size in bytes of this allocation.
+};
+
+/// Typed wrapper around the "void *"-like DeviceMemoryBase class.
+///
+/// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
+/// that represents a buffer of integers stored in Device memory.
+template <typename ElemT> class DeviceMemory : public DeviceMemoryBase {
+public:
+  /// Creates a typed area of DeviceMemory with a given opaque handle and the
+  /// given element count (not a byte count).
+  static DeviceMemory<ElemT> makeFromElementCount(void *Handle,
+                                                  size_t ElementCount) {
+    return DeviceMemory<ElemT>(Handle, ElementCount);
+  }
+
+  /// Constructs a zero-sized memory region with a nullptr handle.
+  DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
+
+  /// Creates a typed device memory region from an untyped device memory region.
+  ///
+  /// This effectively amounts to a cast from a void* to an ElemT*. The byte
+  /// count of Other is stored unchanged; getElementCount() converts it to a
+  /// count of ElemT elements on demand.
+  explicit DeviceMemory(const DeviceMemoryBase &Other)
+      : DeviceMemoryBase(Other.getHandle(), Other.getByteCount()) {}
+
+  /// Copyable like a pointer.
+  DeviceMemory(const DeviceMemory &) = default;
+
+  /// Assignment copyable like a pointer.
+  DeviceMemory &operator=(const DeviceMemory &) = default;
+
+  /// Returns the number of whole elements of type ElemT that fit in this
+  /// allocation (the byte count divided by sizeof(ElemT), rounded down).
+  size_t getElementCount() const { return getByteCount() / sizeof(ElemT); }
+
+  /// Returns whether this allocation holds exactly one element.
+  bool isScalar() const { return getElementCount() == 1; }
+
+protected:
+  /// Constructs a DeviceMemory instance from an opaque handle and an element
+  /// count.
+  ///
+  /// This constructor is not public because there is a potential for confusion
+  /// between the size of the buffer in bytes and the size of the buffer in
+  /// elements.
+  ///
+  /// The static method makeFromElementCount is provided for users of this class
+  /// because its name makes the meaning of the size parameter clear.
+  DeviceMemory(void *Handle, size_t ElementCount)
+      : DeviceMemoryBase(Handle, ElementCount * sizeof(ElemT)) {}
+};
+
+/// A class to represent the type and size of a dynamic shared memory buffer on
+/// a device.
+///
+/// Shared memory buffers exist only on the device and cannot be manipulated
+/// from the host, so instances of this class do not have an opaque handle, only
+/// a size.
+///
+/// This type of memory is called "local" memory in OpenCL and "shared" memory
+/// in CUDA, and both platforms follow the rule that the host code only knows
+/// the size of these buffers and does not have a handle to them.
+///
+/// The treatment of shared memory in StreamExecutor matches the way it is done
+/// in OpenCL where a kernel takes any number of shared memory sizes as kernel
+/// function arguments.
+///
+/// In CUDA only one shared memory size argument is allowed per kernel call.
+/// StreamExecutor handles this by allowing CUDA kernel signatures that take
+/// multiple SharedDeviceMemory arguments, and by simply adding together all the
+/// shared memory sizes to get the final shared memory size that is used to
+/// launch the kernel.
+template <typename ElemT>
+class SharedDeviceMemory : public DeviceMemory<ElemT> {
+public:
+  /// Creates a typed area of SharedDeviceMemory with a given number of
+  /// elements.
+  static SharedDeviceMemory<ElemT> makeFromElementCount(size_t ElementCount) {
+    return SharedDeviceMemory(ElementCount);
+  }
+
+  /// Copyable because it is just an array size.
+  SharedDeviceMemory(const SharedDeviceMemory &) = default;
+
+  /// Assignment copyable because it is just an array size.
+  SharedDeviceMemory &operator=(const SharedDeviceMemory &) = default;
+
+private:
+  /// Constructs an instance with a null handle and the given element count.
+  ///
+  /// This constructor is not public because there is a potential for confusion
+  /// between the size of the buffer in bytes and the size of the buffer in
+  /// elements.
+  ///
+  /// The static method makeFromElementCount is provided for users of this class
+  /// because its name makes the meaning of the size parameter clear.
+  explicit SharedDeviceMemory(size_t ElementCount)
+      : DeviceMemory<ElemT>(nullptr, ElementCount) {}
+};
+
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_DEVICEMEMORY_H
Index: streamexecutor/include/streamexecutor/PackedKernelArgumentArray.h
===================================================================
--- /dev/null
+++ streamexecutor/include/streamexecutor/PackedKernelArgumentArray.h
@@ -0,0 +1,202 @@
+//===-- PackedKernelArgumentArray.h - Packed kernel arg types ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The types in this file are designed to deal with the fact that DeviceMemory
+/// kernel arguments are treated differently from other arguments during kernel
+/// argument packing. A non-DeviceMemory argument is passed to a kernel by
+/// specifying its address and maybe its size in bytes, depending on the
+/// platform, but DeviceMemory arguments require different steps.
+///
+/// DeviceMemory<T> arguments are passed to a kernel by passing their opaque
+/// handle. SharedDeviceMemory<T> arguments have no associated address, only a
+/// size, so the size is the only information that gets passed to the kernel
+/// launch.
+///
+/// The KernelArgumentType enum is used to keep track of the type of each
+/// argument.
+///
+/// The PackedKernelArgumentArray class uses template metaprogramming to convert
+/// each argument to a PackedKernelArgument with minimal runtime overhead.
+///
+/// The design of the PackedKernelArgumentArray class has a few idiosyncrasies
+/// due to the fact that parameter packing has been identified as
+/// performance-critical in some applications. The packed argument data is
+/// stored as a struct of arrays rather than an array of structs because CUDA
+/// kernel launches take an array of argument addresses. Having created the
+/// array of argument addresses here, no further work will need to be done in
+/// the CUDA layer to unpack and repack the addresses.
+///
+/// The shared memory argument count is maintained because in the common case
+/// where it is zero, the CUDA layer doesn't have to loop through the argument
+/// array and sum up all the shared memory sizes. This is another performance
+/// optimization that shows up as a quirk in this class interface.
+///
+/// The platform-interface kernel launch function will take the following
+/// arguments, which are provided by this interface:
+/// * argument count,
+/// * array of argument addresses,
+/// * array of argument sizes,
+/// * array of argument types, and
+/// * shared memory argument count.
+/// This information should be enough to allow any platform to launch the kernel
+/// efficiently, although it is probably more information than is needed for any
+/// specific platform.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
+#define STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
+
+#include <array>
+
+#include "streamexecutor/DeviceMemory.h"
+
+namespace streamexecutor {
+
+enum class KernelArgumentType {
+  VALUE,                ///< Non-DeviceMemory argument.
+  GLOBAL_DEVICE_MEMORY, ///< Non-shared DeviceMemory argument.
+  SHARED_DEVICE_MEMORY  ///< SharedDeviceMemory argument.
+};
+
+/// An array of packed kernel arguments, stored as parallel fixed-size arrays.
+template <typename... ParameterTs> class PackedKernelArgumentArray {
+public:
+  /// Packs the specified arguments; value arguments must outlive this object.
+  PackedKernelArgumentArray(const ParameterTs &... Arguments)
+      : SharedCount(0u) {
+    PackArguments(0, Arguments...);
+  }
+
+  /// Gets the number of packed arguments.
+  size_t getArgumentCount() const { return sizeof...(ParameterTs); }
+
+  /// Gets the address of the argument at the given index (no bounds check).
+  const void *getAddress(size_t Index) const { return Addresses[Index]; }
+
+  /// Gets the size of the argument at the given index (no bounds check).
+  size_t getSize(size_t Index) const { return Sizes[Index]; }
+
+  /// Gets the type of the argument at the given index (no bounds check).
+  KernelArgumentType getType(size_t Index) const { return Types[Index]; }
+
+  /// Gets a pointer to the address array.
+  const void *const *getAddresses() const { return Addresses.data(); }
+
+  /// Gets a read-only pointer to the sizes array.
+  const size_t *getSizes() const { return Sizes.data(); }
+
+  /// Gets a read-only pointer to the types array.
+  const KernelArgumentType *getTypes() const { return Types.data(); }
+
+  /// Gets the number of SharedDeviceMemory arguments.
+  size_t getSharedCount() const { return SharedCount; }
+
+private:
+  // Base case for PackArguments when there are no arguments to pack.
+  void PackArguments(size_t) {}
+
+  // Induction step for PackArguments: pack the first, recurse on the rest.
+  template <typename T, typename... RemainingParameterTs>
+  void PackArguments(size_t Index, const T &Argument,
+                     const RemainingParameterTs &... RemainingArguments) {
+    PackOneArgument(Index, Argument);
+    PackArguments(Index + 1, RemainingArguments...);
+  }
+
+  // Pack a normal, non-DeviceMemory argument by address (no copy is made).
+  template <typename T> void PackOneArgument(size_t Index, const T &Argument) {
+    Addresses[Index] = &Argument;
+    Sizes[Index] = sizeof(T);
+    Types[Index] = KernelArgumentType::VALUE;
+  }
+
+  // Pack a DeviceMemoryBase argument.
+  void PackOneArgument(size_t Index, const DeviceMemoryBase &Argument) {
+    Addresses[Index] = Argument.getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a DeviceMemoryBase pointer argument.
+  void PackOneArgument(size_t Index, DeviceMemoryBase *const &Argument) {
+    Addresses[Index] = Argument->getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a const DeviceMemoryBase pointer argument.
+  void PackOneArgument(size_t Index, const DeviceMemoryBase *const &Argument) {
+    Addresses[Index] = Argument->getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a DeviceMemory<T> argument.
+  template <typename T>
+  void PackOneArgument(size_t Index, const DeviceMemory<T> &Argument) {
+    Addresses[Index] = Argument.getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a DeviceMemory<T> pointer argument.
+  template <typename T>
+  void PackOneArgument(size_t Index, DeviceMemory<T> *const &Argument) {
+    Addresses[Index] = Argument->getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a const DeviceMemory<T> pointer argument.
+  template <typename T>
+  void PackOneArgument(size_t Index, const DeviceMemory<T> *const &Argument) {
+    Addresses[Index] = Argument->getHandle();
+    Sizes[Index] = sizeof(void *);
+    Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
+  }
+
+  // Pack a SharedDeviceMemory argument: only its byte size is recorded.
+  template <typename T>
+  void PackOneArgument(size_t Index, const SharedDeviceMemory<T> &Argument) {
+    ++SharedCount;
+    Addresses[Index] = nullptr;
+    Sizes[Index] = Argument.getByteCount();
+    Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
+  }
+
+  // Pack a SharedDeviceMemory pointer argument.
+  template <typename T>
+  void PackOneArgument(size_t Index, SharedDeviceMemory<T> *const &Argument) {
+    ++SharedCount;
+    Addresses[Index] = nullptr;
+    Sizes[Index] = Argument->getByteCount();
+    Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
+  }
+
+  // Pack a const SharedDeviceMemory pointer argument.
+  template <typename T>
+  void PackOneArgument(size_t Index,
+                       const SharedDeviceMemory<T> *const &Argument) {
+    ++SharedCount;
+    Addresses[Index] = nullptr;
+    Sizes[Index] = Argument->getByteCount();
+    Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
+  }
+
+  std::array<const void *, sizeof...(ParameterTs)> Addresses;
+  std::array<size_t, sizeof...(ParameterTs)> Sizes;
+  std::array<KernelArgumentType, sizeof...(ParameterTs)> Types;
+  size_t SharedCount;
+};
+
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
Index: streamexecutor/lib/unittests/CMakeLists.txt
===================================================================
--- streamexecutor/lib/unittests/CMakeLists.txt
+++ streamexecutor/lib/unittests/CMakeLists.txt
@@ -17,3 +17,12 @@
     ${GTEST_BOTH_LIBRARIES}
     ${CMAKE_THREAD_LIBS_INIT})
 add_test(KernelSpecTest kernel_spec_test)
+
+add_executable(
+    packed_kernel_argument_array_test # Tests for kernel argument packing.
+    PackedKernelArgumentArrayTest.cpp)
+target_link_libraries(
+    packed_kernel_argument_array_test
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(PackedKernelArgumentArrayTest packed_kernel_argument_array_test)
Index: streamexecutor/lib/unittests/PackedKernelArgumentArrayTest.cpp
===================================================================
--- /dev/null
+++ streamexecutor/lib/unittests/PackedKernelArgumentArrayTest.cpp
@@ -0,0 +1,170 @@
+//===-- PackedKernelArgumentArrayTest.cpp - tests for kernel arg packing --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Unit tests for kernel argument packing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "streamexecutor/DeviceMemory.h"
+#include "streamexecutor/PackedKernelArgumentArray.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+namespace se = ::streamexecutor;
+
+using Type = se::KernelArgumentType;
+
+// Utility template function that deduces the PackedKernelArgumentArray template
+// arguments from the types of the arguments passed to it, then constructs and
+// returns the array. Arguments are taken by reference and forwarded as such.
+template <typename... ParameterTs>
+se::PackedKernelArgumentArray<ParameterTs...>
+pack(const ParameterTs &... Arguments) {
+  return se::PackedKernelArgumentArray<ParameterTs...>(Arguments...);
+}
+
+// Test fixture class for testing argument packing.
+//
+// Defines one object of each kind that can be packed (a plain value and the
+// three device memory flavors) so each test doesn't have to build its own.
+class DeviceMemoryPackingTest : public ::testing::Test {
+public:
+  DeviceMemoryPackingTest()
+      : Value(42), Handle(&Value), ByteCount(15), ElementCount(5),
+        DeviceMemoryBase(Handle, ByteCount),
+        DeviceMemory(
+            se::DeviceMemory<int>::makeFromElementCount(Handle, ElementCount)),
+        SharedDeviceMemory(
+            se::SharedDeviceMemory<int>::makeFromElementCount(ElementCount)) {}
+
+  int Value;           // Plain value argument; also the pointee of Handle.
+  void *Handle;        // Stand-in device handle: the host address of Value.
+  size_t ByteCount;    // Size given to the untyped DeviceMemoryBase.
+  size_t ElementCount; // Element count given to the typed memory objects.
+  se::DeviceMemoryBase DeviceMemoryBase;
+  se::DeviceMemory<int> DeviceMemory;
+  se::SharedDeviceMemory<int> SharedDeviceMemory;
+};
+
+// Utility function that checks the expected address, size, and type of the
+// packed argument at Index; each failure message includes the index.
+template <typename... ParameterTs>
+static void
+ExpectEqual(const void *ExpectedAddress, size_t ExpectedSize, Type ExpectedType,
+            const se::PackedKernelArgumentArray<ParameterTs...> &Observed,
+            size_t Index) {
+  EXPECT_EQ(ExpectedAddress, Observed.getAddress(Index)) << "Index: " << Index;
+  EXPECT_EQ(ExpectedSize, Observed.getSize(Index)) << "Index: " << Index;
+  EXPECT_EQ(ExpectedType, Observed.getType(Index)) << "Index: " << Index;
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleArgumentValue) {
+  auto Array = pack(Value); // Plain int: expect its own address and sizeof.
+  ExpectEqual(&Value, sizeof(Value), Type::VALUE, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryBase) {
+  auto Array = pack(DeviceMemoryBase); // Untyped memory: handle is packed.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryBasePointer) {
+  auto Array = pack(&DeviceMemoryBase); // Pointer: same result as pointee.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleConstDeviceMemoryBasePointer) {
+  const se::DeviceMemoryBase *ConstPointer = &DeviceMemoryBase;
+  auto Array = pack(ConstPointer); // Pointer-to-const: same as pointee.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleDeviceMemory) {
+  auto Array = pack(DeviceMemory); // Typed memory: the handle is packed.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleDeviceMemoryPointer) {
+  auto Array = pack(&DeviceMemory); // Pointer: same result as pointee.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleConstDeviceMemoryPointer) {
+  const se::DeviceMemory<int> *ArgumentPointer = &DeviceMemory;
+  auto Array = pack(ArgumentPointer); // Pointer-to-const: same as pointee.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(0u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleSharedDeviceMemory) {
+  auto Array = pack(SharedDeviceMemory); // Shared: size only, null address.
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(1u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleSharedDeviceMemoryPointer) {
+  auto Array = pack(&SharedDeviceMemory); // Pointer: same result as pointee.
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(1u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, SingleConstSharedDeviceMemoryPointer) {
+  const se::SharedDeviceMemory<int> *ArgumentPointer = &SharedDeviceMemory;
+  auto Array = pack(ArgumentPointer); // Pointer-to-const: same as pointee.
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 0);
+  EXPECT_EQ(1u, Array.getArgumentCount());
+  EXPECT_EQ(1u, Array.getSharedCount());
+}
+
+TEST_F(DeviceMemoryPackingTest, PackSeveralArguments) {
+  const se::DeviceMemoryBase *BasePointer = &DeviceMemoryBase;
+  const se::DeviceMemory<int> *TypedPointer = &DeviceMemory;
+  const se::SharedDeviceMemory<int> *SharedPointer = &SharedDeviceMemory;
+  auto Array = pack(Value, DeviceMemoryBase, &DeviceMemoryBase, BasePointer,
+                    DeviceMemory, &DeviceMemory, TypedPointer,
+                    SharedDeviceMemory, &SharedDeviceMemory, SharedPointer);
+  ExpectEqual(&Value, sizeof(Value), Type::VALUE, Array, 0); // Plain value.
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 1);
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 2);
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 3);
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 4);
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 5);
+  ExpectEqual(Handle, sizeof(void *), Type::GLOBAL_DEVICE_MEMORY, Array, 6);
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 7);
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 8);
+  ExpectEqual(nullptr, SharedDeviceMemory.getByteCount(),
+              Type::SHARED_DEVICE_MEMORY, Array, 9);
+  EXPECT_EQ(10u, Array.getArgumentCount()); // One entry per packed argument.
+  EXPECT_EQ(3u, Array.getSharedCount());    // Indices 7, 8, and 9.
+}
+
+} // namespace