diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
--- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
@@ -31,7 +31,7 @@
 
   /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
   /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr) = 0;
+  virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0;
 
   /// Delete the pointer \p TgtPtr on the device
   virtual int free(void *TgtPtr) = 0;
@@ -133,7 +133,7 @@
 
   /// Request memory from target device
   void *allocateOnDevice(size_t Size, void *HstPtr) const {
-    return DeviceAllocator.allocate(Size, HstPtr);
+    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
   }
 
   /// Deallocate data on device
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -17,6 +17,7 @@
 #include <memory>
 #include <mutex>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "Debug.h"
@@ -297,12 +298,13 @@
 class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
   const int DeviceId;
   const std::vector<DeviceDataTy> &DeviceData;
+  std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
 
 public:
   CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
       : DeviceId(DeviceId), DeviceData(DeviceData) {}
 
-  void *allocate(size_t Size, void *) override {
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
     if (Size == 0)
       return nullptr;
 
@@ -310,12 +312,34 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return nullptr;
 
-    CUdeviceptr DevicePtr;
-    Err = cuMemAlloc(&DevicePtr, Size);
-    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-      return nullptr;
+    void *MemAlloc = nullptr;
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      CUdeviceptr DevicePtr;
+      Err = cuMemAlloc(&DevicePtr, Size);
+      MemAlloc = (void *)DevicePtr;
+      if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+        return nullptr;
+      break;
+    case TARGET_ALLOC_HOST:
+      void *HostPtr;
+      Err = cuMemAllocHost(&HostPtr, Size);
+      MemAlloc = HostPtr;
+      if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
+        return nullptr;
+      HostPinnedAllocs[MemAlloc] = Kind;
+      break;
+    case TARGET_ALLOC_SHARED:
+      CUdeviceptr SharedPtr;
+      Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
+      MemAlloc = (void *)SharedPtr;
+      if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
+        return nullptr;
+      break;
+    }
 
-    return (void *)DevicePtr;
+    return MemAlloc;
   }
 
   int free(void *TgtPtr) override {
@@ -323,9 +347,25 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    Err = cuMemFree((CUdeviceptr)TgtPtr);
-    if (!checkResult(Err, "Error returned from cuMemFree\n"))
-      return OFFLOAD_FAIL;
+    // Host pinned memory must be freed differently.
+    TargetAllocTy Kind =
+        (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
+            ? TARGET_ALLOC_DEFAULT
+            : TARGET_ALLOC_HOST;
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_SHARED:
+      Err = cuMemFree((CUdeviceptr)TgtPtr);
+      if (!checkResult(Err, "Error returned from cuMemFree\n"))
+        return OFFLOAD_FAIL;
+      break;
+    case TARGET_ALLOC_HOST:
+      Err = cuMemFreeHost(TgtPtr);
+      if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
+        return OFFLOAD_FAIL;
+      break;
+    }
 
     return OFFLOAD_SUCCESS;
   }
@@ -804,11 +844,24 @@
     return getOffloadEntriesTable(DeviceId);
   }
 
-  void *dataAlloc(const int DeviceId, const int64_t Size) {
-    if (UseMemoryManager)
-      return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+  void *dataAlloc(const int DeviceId, const int64_t Size,
+                  const TargetAllocTy Kind) {
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      if (UseMemoryManager)
+        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+      else
+        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+    case TARGET_ALLOC_HOST:
+    case TARGET_ALLOC_SHARED:
+      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+    }
 
-    return DeviceAllocators[DeviceId].allocate(Size, nullptr);
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+
+    return nullptr;
   }
 
   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
@@ -1097,13 +1150,7 @@
                               int32_t kind) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  if (kind != TARGET_ALLOC_DEFAULT) {
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
-
-  return DeviceRTL.dataAlloc(device_id, size);
+  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
 }
 
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
diff --git a/openmp/libomptarget/test/api/omp_device_managed_memory.c b/openmp/libomptarget/test/api/omp_device_managed_memory.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_device_managed_memory.c
@@ -0,0 +1,29 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *llvm_omp_target_alloc_shared(size_t, int);
+
+int main() {
+  const int N = 64;
+  const int device = omp_get_default_device();
+
+  int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);
+
+#pragma omp target teams distribute parallel for device(device) \
+    is_device_ptr(shared_ptr)
+  for (int i = 0; i < N; ++i) {
+    shared_ptr[i] = 1;
+  }
+
+  int sum = 0;
+  for (int i = 0; i < N; ++i)
+    sum += shared_ptr[i];
+
+  omp_target_free(shared_ptr, device);
+  // CHECK: PASS
+  if (sum == N)
+    printf ("PASS\n");
+}
diff --git a/openmp/libomptarget/test/api/omp_host_pinned_memory.c b/openmp/libomptarget/test/api/omp_host_pinned_memory.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_host_pinned_memory.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+// Allocate pinned memory on the host
+void *llvm_omp_target_alloc_host(size_t, int);
+
+int main() {
+  const int N = 64;
+  const int device = omp_get_default_device();
+  const int host = omp_get_initial_device();
+
+  int *hst_ptr = llvm_omp_target_alloc_host(N * sizeof(int), device);
+
+  for (int i = 0; i < N; ++i)
+    hst_ptr[i] = 2;
+
+#pragma omp target teams distribute parallel for device(device) \
+    map(tofrom:hst_ptr[0 : N])
+  for (int i = 0; i < N; ++i)
+    hst_ptr[i] -= 1;
+
+  int sum = 0;
+  for (int i = 0; i < N; ++i)
+    sum += hst_ptr[i];
+
+  omp_target_free(hst_ptr, device);
+  // CHECK: PASS
+  if (sum == N)
+    printf ("PASS\n");
+}