diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -411,8 +411,9 @@ void *allocData(int64_t Size, void *HstPtr = nullptr, int32_t Kind = TARGET_ALLOC_DEFAULT); /// Deallocates memory which \p TgtPtrBegin points at and returns - /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. - int32_t deleteData(void *TgtPtrBegin); + /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. p Kind dictates what + /// allocator should be used (host, shared, device). + int32_t deleteData(void *TgtPtrBegin, int32_t Kind = TARGET_ALLOC_DEFAULT); // Data transfer. When AsyncInfo is nullptr, the transfer will be // synchronous. diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -247,6 +247,12 @@ void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +/// Explicit target memory deallocators +/// Using the llvm_ prefix until they become part of the OpenMP standard. +void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum); +void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); + /// Dummy target so we have a symbol for generating host fallback. void *llvm_omp_target_dynamic_shared_alloc(); diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -117,8 +117,9 @@ __tgt_async_info *AsyncInfo); // De-allocate the data referenced by target ptr on the device. In case of -// success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); +// success, return zero. Otherwise, return an error code. Kind dictates what +// allocator to use (e.g. shared, host, device). +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t Kind); // Transfer control to the offloaded entry Entry on the target device. // Args and Offsets are arrays of NumArgs size of target addresses and diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -49,7 +49,7 @@ typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, int64_t, __tgt_async_info *); - typedef int32_t(data_delete_ty)(int32_t, void *); + typedef int32_t(data_delete_ty)(int32_t, void *, int32_t); typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t); typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -2606,7 +2606,7 @@ return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); } -int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { +int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); // HSA can free pointers allocated from different types of memory pool. hsa_status_t Err; diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h --- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h +++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h @@ -31,10 +31,11 @@ /// Allocate a memory of size \p Size . \p HstPtr is used to assist the /// allocation. - virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0; + virtual void *allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; /// Delete the pointer \p TgtPtr on the device - virtual int free(void *TgtPtr) = 0; + virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; }; /// Class of memory manager. The memory manager is per-device by using diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "Debug.h" @@ -366,8 +365,6 @@ /// A class responsible for interacting with device native runtime library to /// allocate and free memory. class CUDADeviceAllocatorTy : public DeviceAllocatorTy { - std::unordered_map HostPinnedAllocs; - public: void *allocate(size_t Size, void *, TargetAllocTy Kind) override { if (Size == 0) @@ -390,7 +387,6 @@ MemAlloc = HostPtr; if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) return nullptr; - HostPinnedAllocs[MemAlloc] = Kind; break; case TARGET_ALLOC_SHARED: CUdeviceptr SharedPtr; @@ -404,13 +400,9 @@ return MemAlloc; } - int free(void *TgtPtr) override { + int free(void *TgtPtr, TargetAllocTy Kind) override { CUresult Err; // Host pinned memory must be freed differently. - TargetAllocTy Kind = - (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) - ? TARGET_ALLOC_DEFAULT - : TARGET_ALLOC_HOST; switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: @@ -1102,11 +1094,23 @@ return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } - int dataDelete(const int DeviceId, void *TgtPtr) { - if (UseMemoryManager) - return MemoryManagers[DeviceId]->free(TgtPtr); + int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) { + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + if (UseMemoryManager) + return MemoryManagers[DeviceId]->free(TgtPtr); + else + return DeviceAllocators[DeviceId].free(TgtPtr, Kind); + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + return DeviceAllocators[DeviceId].free(TgtPtr, Kind); + } - return DeviceAllocators[DeviceId].free(TgtPtr); + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + + return OFFLOAD_FAIL; } int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, @@ -1699,13 +1703,13 @@ return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo); } -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) { +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - return DeviceRTL.dataDelete(DeviceId, TgtPtr); + return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind); } int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -232,7 +232,7 @@ return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) { +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) { free(TgtPtr); return OFFLOAD_SUCCESS; } diff --git a/openmp/libomptarget/plugins/remote/src/rtl.cpp b/openmp/libomptarget/plugins/remote/src/rtl.cpp --- a/openmp/libomptarget/plugins/remote/src/rtl.cpp +++ b/openmp/libomptarget/plugins/remote/src/rtl.cpp @@ -93,7 +93,7 @@ return Manager->dataRetrieve(DeviceId, HstPtr, TgtPtr, Size); } -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) { +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) { return Manager->dataDelete(DeviceId, TgtPtr); } diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins/ve/src/rtl.cpp --- a/openmp/libomptarget/plugins/ve/src/rtl.cpp +++ b/openmp/libomptarget/plugins/ve/src/rtl.cpp @@ -392,7 +392,7 @@ // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) { +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t) { int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr); if (ret != 0) { diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -62,34 +62,25 @@ return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_SHARED, __func__); } -EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; } -EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; } - -EXTERN void omp_target_free(void *DevicePtr, int DeviceNum) { - TIMESCOPE(); - DP("Call to omp_target_free for device %d and address " DPxMOD "\n", - DeviceNum, DPxPTR(DevicePtr)); - - if (!DevicePtr) { - DP("Call to omp_target_free with NULL ptr\n"); - return; - } +EXTERN void omp_target_free(void *Ptr, int DeviceNum) { + return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); +} - if (DeviceNum == omp_get_initial_device()) { - free(DevicePtr); - DP("omp_target_free deallocated host ptr\n"); - return; - } +EXTERN void llvm_omp_target_free_device(void *Ptr, int DeviceNum) { + return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEVICE, __func__); +} - if (!deviceIsReady(DeviceNum)) { - DP("omp_target_free returns, nothing to do\n"); - return; - } +EXTERN void llvm_omp_target_free_host(void *Ptr, int DeviceNum) { + return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_HOST, __func__); +} - PM->Devices[DeviceNum]->deleteData(DevicePtr); - DP("omp_target_free deallocated device ptr\n"); +EXTERN void llvm_omp_target_free_shared(void *Ptre, int DeviceNum) { + return targetFreeExplicit(Ptre, DeviceNum, TARGET_ALLOC_SHARED, __func__); } +EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; } +EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; } + EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -530,8 +530,8 @@ return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); } -int32_t DeviceTy::deleteData(void *TgtPtrBegin) { - return RTL->data_delete(RTLDeviceID, TgtPtrBegin); +int32_t DeviceTy::deleteData(void *TgtPtrBegin, int32_t Kind) { + return RTL->data_delete(RTLDeviceID, TgtPtrBegin, Kind); } // Submit data to device diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -43,6 +43,9 @@ llvm_omp_target_alloc_host; llvm_omp_target_alloc_shared; llvm_omp_target_alloc_device; + llvm_omp_target_free_host; + llvm_omp_target_free_shared; + llvm_omp_target_free_device; llvm_omp_target_dynamic_shared_alloc; __tgt_set_info_flag; __tgt_print_device_info; diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -368,6 +368,32 @@ return Rc; } +void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, + const char *Name) { + TIMESCOPE(); + DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum, + DPxPTR(DevicePtr)); + + if (!DevicePtr) { + DP("Call to %s with NULL ptr\n", Name); + return; + } + + if (DeviceNum == omp_get_initial_device()) { + free(DevicePtr); + DP("%s deallocated host ptr\n", Name); + return; + } + + if (!deviceIsReady(DeviceNum)) { + DP("%s returns, nothing to do\n", Name); + return; + } + + PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind); + DP("omp_target_free deallocated device ptr\n"); +} + /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -49,6 +49,8 @@ extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc); extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name); +extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, + const char *Name); // This structure stores information of a mapped memory region. struct MapComponentInfoTy { diff --git a/openmp/libomptarget/test/api/omp_device_managed_memory.c b/openmp/libomptarget/test/api/omp_device_managed_memory.c --- a/openmp/libomptarget/test/api/omp_device_managed_memory.c +++ b/openmp/libomptarget/test/api/omp_device_managed_memory.c @@ -5,6 +5,7 @@ #include void *llvm_omp_target_alloc_shared(size_t, int); +void llvm_omp_target_free_shared(void *, int); int main() { const int N = 64; @@ -12,8 +13,8 @@ int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device); -#pragma omp target teams distribute parallel for device(device) \ - is_device_ptr(shared_ptr) +#pragma omp target teams distribute parallel for device(device) \ + is_device_ptr(shared_ptr) for (int i = 0; i < N; ++i) { shared_ptr[i] = 1; } @@ -22,8 +23,8 @@ for (int i = 0; i < N; ++i) sum += shared_ptr[i]; - omp_target_free(shared_ptr, device); + llvm_omp_target_free_shared(shared_ptr, device); // CHECK: PASS if (sum == N) - printf ("PASS\n"); + printf("PASS\n"); } diff --git a/openmp/libomptarget/test/api/omp_host_pinned_memory.c b/openmp/libomptarget/test/api/omp_host_pinned_memory.c --- a/openmp/libomptarget/test/api/omp_host_pinned_memory.c +++ b/openmp/libomptarget/test/api/omp_host_pinned_memory.c @@ -5,6 +5,7 @@ // Allocate pinned memory on the host void *llvm_omp_target_alloc_host(size_t, int); +void llvm_omp_target_free_host(void *, int); int main() { const int N = 64; @@ -25,7 +26,7 @@ for (int i = 0; i < N; ++i) sum += hst_ptr[i]; - omp_target_free(hst_ptr, device); + llvm_omp_target_free_host(hst_ptr, device); // CHECK: PASS if (sum == N) printf ("PASS\n"); diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -1245,7 +1245,9 @@ static void *(*kmp_target_alloc_host)(size_t size, int device); static void *(*kmp_target_alloc_shared)(size_t size, int device); static void *(*kmp_target_alloc_device)(size_t size, int device); -static void *(*kmp_target_free)(void *ptr, int device); +static void *(*kmp_target_free_host)(void *ptr, int device); +static void *(*kmp_target_free_shared)(void *ptr, int device); +static void *(*kmp_target_free_device)(void *ptr, int device); static bool __kmp_target_mem_available; #define KMP_IS_TARGET_MEM_SPACE(MS) \ (MS == llvm_omp_target_host_mem_space || \ @@ -1358,10 +1360,15 @@ KMP_DLSYM("llvm_omp_target_alloc_shared"); *(void **)(&kmp_target_alloc_device) = KMP_DLSYM("llvm_omp_target_alloc_device"); - *(void **)(&kmp_target_free) = KMP_DLSYM("omp_target_free"); - __kmp_target_mem_available = kmp_target_alloc_host && - kmp_target_alloc_shared && - kmp_target_alloc_device && kmp_target_free; + *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host"); + *(void **)(&kmp_target_free_shared) = + KMP_DLSYM("llvm_omp_target_free_shared"); + *(void **)(&kmp_target_free_device) = + KMP_DLSYM("llvm_omp_target_free_device"); + __kmp_target_mem_available = + kmp_target_alloc_host && kmp_target_alloc_shared && + kmp_target_alloc_device && kmp_target_free_host && + kmp_target_free_shared && kmp_target_free_device; } omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, @@ -1774,13 +1781,18 @@ kmp_mem_desc_t desc; kmp_uintptr_t addr_align; // address to return to caller kmp_uintptr_t addr_descr; // address of memory block descriptor - if (KMP_IS_TARGET_MEM_ALLOC(allocator) || - (allocator > kmp_max_mem_alloc && - KMP_IS_TARGET_MEM_SPACE(al->memspace))) { - KMP_DEBUG_ASSERT(kmp_target_free); + if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) || + (allocator > kmp_max_mem_alloc && + KMP_IS_TARGET_MEM_SPACE(al->memspace)))) { kmp_int32 device = __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; - kmp_target_free(ptr, device); + if (allocator == llvm_omp_target_host_mem_alloc) { + kmp_target_free_host(ptr, device); + } else if (allocator == llvm_omp_target_shared_mem_alloc) { + kmp_target_free_shared(ptr, device); + } else if (allocator == llvm_omp_target_device_mem_alloc) { + kmp_target_free_device(ptr, device); + } return; }