diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -202,6 +202,13 @@ int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr, const char **ErrStr); +// lock/pin host memory +int32_t __tgt_rtl_data_lock(int32_t ID, void *HstPtr, int64_t Size, + void **LockedPtr); + +// unlock/unpin host memory +int32_t __tgt_rtl_data_unlock(int32_t ID, void *HstPtr); + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -76,6 +76,8 @@ typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *, const char **); + typedef int32_t(data_lock_ty)(int32_t, void *, int64_t, void **); + typedef int32_t(data_unlock_ty)(int32_t, void *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -127,6 +129,8 @@ init_async_info_ty *init_async_info = nullptr; init_device_into_ty *init_device_info = nullptr; release_async_info_ty *release_async_info = nullptr; + data_lock_ty *data_lock = nullptr; + data_unlock_ty *data_unlock = nullptr; // Are there images associated with this RTL. bool IsUsed = false; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -12,6 +12,36 @@ * Data */ +hsa_status_t is_locked(void *ptr, void **agentBaseAddress) { + hsa_status_t err = HSA_STATUS_SUCCESS; + hsa_amd_pointer_info_t info; + info.size = sizeof(hsa_amd_pointer_info_t); + err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr, + /*num_agents_accessible=*/nullptr, + /*accessible=*/nullptr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when getting pointer info\n"); + return err; + } + + if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) { + // When user passes in a basePtr+offset we need to fix the + // locked pointer to include the offset: ROCr always returns + // the base locked address, not the shifted one. + if ((char *)info.hostBaseAddress <= (char *)ptr && + (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes) + *agentBaseAddress = + (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - + (uint64_t)info.hostBaseAddress); + else // address is already device-agent accessible, no need to compute + // offset + *agentBaseAddress = ptr; + } else + *agentBaseAddress = nullptr; + + return HSA_STATUS_SUCCESS; +} + // host pointer (either src or dest) must be locked via hsa_amd_memory_lock static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, hsa_agent_t agent, const void *src, @@ -49,12 +79,21 @@ hsa_signal_t signal, void *dest, hsa_agent_t agent, void *src, void *lockingPtr, size_t size) { - hsa_status_t err; - void *lockedPtr = nullptr; - err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr); + hsa_status_t err = is_locked(lockingPtr, &lockedPtr); + bool HostPtrIsLocked = true; if (err != HSA_STATUS_SUCCESS) return err; + if (!lockedPtr) { // not locked + HostPtrIsLocked = false; + hsa_agent_t agents[1] = {agent}; + err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1, + (void **)&lockedPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n", + lockingPtr, lockedPtr, size); + } switch (direction) { case H2D: @@ -65,13 +104,16 @@ break; } - if (err != HSA_STATUS_SUCCESS) { + if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) { // do not leak locked host pointers, but discard potential error message + // because the initial error was in the copy function hsa_amd_memory_unlock(lockingPtr); return err; } - err = hsa_amd_memory_unlock(lockingPtr); + // unlock only if not user locked + if (!HostPtrIsLocked) + err = hsa_amd_memory_unlock(lockingPtr); if (err != HSA_STATUS_SUCCESS) return err; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h --- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -12,6 +12,9 @@ extern "C" { +// Check if pointer ptr is already locked +hsa_status_t is_locked(void *ptr, void **agentBaseAddress); + hsa_status_t impl_module_register_from_memory_to_place( void *module_bytes, size_t module_size, int DeviceId, hsa_status_t (*on_deserialized_data)(void *data, size_t size, diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1816,6 +1816,35 @@ return (Rc == 0) && (SI.Addr != nullptr); } +hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, + void **LockedHostPtr) { + hsa_status_t err = is_locked(HostPtr, LockedHostPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + + // HostPtr is already locked, just return it + if (*LockedHostPtr) + return HSA_STATUS_SUCCESS; + + hsa_agent_t Agents[1] = {Agent}; + return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, + LockedHostPtr); +} + +hsa_status_t unlock_memory(void *HostPtr) { + void *LockedHostPtr = nullptr; + hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + + // if LockedHostPtr is nullptr, then HostPtr was not locked + if (!LockedHostPtr) + return HSA_STATUS_SUCCESS; + + err = hsa_amd_memory_unlock(HostPtr); + return err; +} + } // namespace namespace core { @@ -2589,4 +2618,32 @@ DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); } +int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, + void **LockedHostPtr) { + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + + hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; + hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error in tgt_rtl_data_lock\n"); + return OFFLOAD_FAIL; + } + DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)*LockedHostPtr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + hsa_status_t err = unlock_memory(HostPtr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error in tgt_rtl_data_unlock\n"); + return OFFLOAD_FAIL; + } + + DP("Tgt unlock data (tgt:%016llx).\n", + (long long unsigned)(Elf64_Addr)HostPtr); + return OFFLOAD_SUCCESS; +} + } // extern "C" diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -82,6 +82,15 @@ EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; } EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; } +EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size, + int DeviceNum) { + return targetLockExplicit(Ptr, Size, DeviceNum, __func__); +} + +EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) { + targetUnlockExplicit(Ptr, DeviceNum, __func__); +} + EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -49,6 +49,8 @@ llvm_omp_target_free_shared; llvm_omp_target_free_device; llvm_omp_target_dynamic_shared_alloc; + llvm_omp_target_lock_mem; + llvm_omp_target_unlock_mem; __tgt_set_info_flag; __tgt_print_device_info; omp_get_interop_ptr; diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -425,6 +425,80 @@ DP("omp_target_free deallocated device ptr\n"); } +void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, + const char *Name) { + TIMESCOPE(); + DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); + + if (Size <= 0) { + DP("Call to %s with non-positive length\n", Name); + return NULL; + } + + void *rc = NULL; + + if (!deviceIsReady(DeviceNum)) { + DP("%s returns NULL ptr\n", Name); + return NULL; + } + + DeviceTy *DevicePtr = nullptr; + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + if (!PM->Devices[DeviceNum]) { + DP("%s returns, device %d not available\n", Name, DeviceNum); + return nullptr; + } + + DevicePtr = PM->Devices[DeviceNum].get(); + } + + int32_t err = 0; + if (DevicePtr->RTL->data_lock) { + err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc); + if (err) { + DP("Could not lock ptr %p\n", HostPtr); + return nullptr; + } + } + DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc)); + return rc; +} + +void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { + TIMESCOPE(); + DP("Call to %s for device %d unlocking\n", Name, DeviceNum); + + DeviceTy *DevicePtr = nullptr; + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + // Don't check deviceIsReady as it can initialize the device if needed. + // Just check if DeviceNum exists as targetUnlockExplicit can be called + // during process exit/free (and it may have been already destroyed) and + // targetAllocExplicit will have already checked deviceIsReady anyway. + size_t DevicesSize = PM->Devices.size(); + + if (DevicesSize <= (size_t)DeviceNum) { + DP("Device ID %d does not have a matching RTL\n", DeviceNum); + return; + } + + if (!PM->Devices[DeviceNum]) { + DP("%s returns, device %d not available\n", Name, DeviceNum); + return; + } + + DevicePtr = PM->Devices[DeviceNum].get(); + } // unlock RTLsMtx + + if (DevicePtr->RTL->data_unlock) + DevicePtr->RTL->data_unlock(DeviceNum, HostPtr); + + DP("%s returns\n", Name); +} + /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -51,6 +51,10 @@ const char *Name); extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); +extern void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, + const char *Name); +extern void targetUnlockExplicit(void *HostPtr, int DeviceNum, + const char *Name); // This structure stores information of a mapped memory region. struct MapComponentInfoTy { diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -246,6 +246,10 @@ DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); *((void **)&RTL.init_device_info) = DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + *((void **)&RTL.data_lock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock"); + *((void **)&RTL.data_unlock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock"); RTL.LibraryHandler = std::move(DynLibrary); diff --git a/openmp/libomptarget/test/mapping/prelock.cpp b/openmp/libomptarget/test/mapping/prelock.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/mapping/prelock.cpp @@ -0,0 +1,67 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include + +#include + +extern "C" { +void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num); +void llvm_omp_target_unlock_mem(void *ptr, int device_num); +} + +int main() { + int n = 100; + int *unlocked = new int[n]; + + for (int i = 0; i < n; i++) + unlocked[i] = i; + + int *locked = (int *)llvm_omp_target_lock_mem(unlocked, n * sizeof(int), + omp_get_default_device()); + if (!locked) + return 0; + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n]) + for (int i = 0; i < n; i++) + unlocked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10]) + for (int i = 10; i < 20; i++) + unlocked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : locked[ : n]) + for (int i = 0; i < n; i++) + locked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : locked[10 : 10]) + for (int i = 10; i < 20; i++) + locked[i] += 1; + + llvm_omp_target_unlock_mem(unlocked, omp_get_default_device()); + + int err = 0; + for (int i = 0; i < n; i++) { + if (i < 10 || i > 19) { + if (unlocked[i] != i + 2) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 1); + err++; + } + } else if (unlocked[i] != i + 4) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2); + err++; + } + } + + // CHECK: PASS + if (err == 0) + printf("PASS\n"); + + return err; +} diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -1371,6 +1371,9 @@ kmp_target_alloc_host && kmp_target_alloc_shared && kmp_target_alloc_device && kmp_target_free_host && kmp_target_free_shared && kmp_target_free_device; + // lock/pin and unlock/unpin target calls + *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem"); + *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem"); } omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,