Index: openmp/libomptarget/include/omptargetplugin.h =================================================================== --- openmp/libomptarget/include/omptargetplugin.h +++ openmp/libomptarget/include/omptargetplugin.h @@ -202,6 +202,13 @@ int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr, const char **ErrStr); +// lock/pin host memory +int32_t __tgt_rtl_data_lock(int32_t ID, void *HstPtr, int64_t Size, + void **LockedPtr); + +// unlock/unpin host memory +int32_t __tgt_rtl_data_unlock(int32_t ID, void *HstPtr); + #ifdef __cplusplus } #endif Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -76,6 +76,8 @@ typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *, const char **); + typedef int32_t(data_lock_ty)(int32_t, void *, int64_t, void **); + typedef int32_t(data_unlock_ty)(int32_t, void *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -127,6 +129,8 @@ init_async_info_ty *init_async_info = nullptr; init_device_into_ty *init_device_info = nullptr; release_async_info_ty *release_async_info = nullptr; + data_lock_ty *data_lock = nullptr; + data_unlock_ty *data_unlock = nullptr; // Are there images associated with this RTL. bool IsUsed = false; Index: openmp/libomptarget/plugins/amdgpu/impl/impl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -12,6 +12,39 @@ * Data */ +bool is_locked(void *ptr, hsa_status_t *err_p, void **agentBaseAddress) { + bool is_locked = false; + hsa_status_t err = HSA_STATUS_SUCCESS; + hsa_amd_pointer_info_t info; + info.size = sizeof(hsa_amd_pointer_info_t); + err = hsa_amd_pointer_info(ptr, &info, nullptr, nullptr, nullptr); + + if (err_p) + *err_p = err; + + if (err != HSA_STATUS_SUCCESS) { + DP("Error when getting pointer info\n"); + return false; + } + + is_locked = (info.type == HSA_EXT_POINTER_TYPE_LOCKED); + if (is_locked && agentBaseAddress != nullptr) { + // When user passes in a basePtr+offset we need to fix the + // locked pointer to include the offset: ROCr always returns + // the base locked address, not the shifted one. + if ((char *)info.hostBaseAddress <= (char *)ptr && + (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes) + *agentBaseAddress = + (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - + (uint64_t)info.hostBaseAddress); + else // address is already device-agent accessible, no need to compute + // offset + *agentBaseAddress = ptr; + } + + return is_locked; +} + // host pointer (either src or dest) must be locked via hsa_amd_memory_lock static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, hsa_agent_t agent, const void *src, @@ -52,9 +85,17 @@ hsa_status_t err; void *lockedPtr = nullptr; - err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr); - if (err != HSA_STATUS_SUCCESS) - return err; + bool hostPtrIsLocked = is_locked(lockingPtr, &err, &lockedPtr); + if (!hostPtrIsLocked) { + if (err != HSA_STATUS_SUCCESS) + return err; + hsa_agent_t agents[1] = {agent}; + err = hsa_amd_memory_lock(lockingPtr, size, agents, 1, (void **)&lockedPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n", + lockingPtr, lockedPtr, size); + } switch (direction) { case H2D: @@ -71,7 +112,9 @@ return err; } - err = hsa_amd_memory_unlock(lockingPtr); + // unlock only if not user locked + if (!hostPtrIsLocked) + err = hsa_amd_memory_unlock(lockingPtr); if (err != HSA_STATUS_SUCCESS) return err; Index: openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -12,6 +12,10 @@ extern "C" { +// Check if pointer ptr is already locked and return true +// if so. Return false otherwise. +bool is_locked(void *ptr, hsa_status_t *err_p, void **agentBaseAddress); + hsa_status_t impl_module_register_from_memory_to_place( void *module_bytes, size_t module_size, int DeviceId, hsa_status_t (*on_deserialized_data)(void *data, size_t size, Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1816,6 +1816,28 @@ return (Rc == 0) && (SI.Addr != nullptr); } +void *lock_memory(void *mem, size_t size) { + void *lockedPtr = nullptr; + hsa_status_t err = HSA_STATUS_SUCCESS; + + if (is_locked(mem, &err, nullptr)) + return mem; + + err = hsa_amd_memory_lock(mem, size, nullptr, /*num_agent=*/0, + (void **)&lockedPtr); + if (err != HSA_STATUS_SUCCESS) + return nullptr; + + return lockedPtr; +} + +hsa_status_t unlock_memory(void *mem) { + hsa_status_t err = HSA_STATUS_SUCCESS; + if (is_locked(mem, &err, nullptr)) + err = hsa_amd_memory_unlock(mem); + return err; +} + } // namespace namespace core { @@ -2589,4 +2611,35 @@ DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); } +int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, + void **LockedPtr) { + void *ptr = nullptr; + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + + ptr = lock_memory(HostPtr, Size); + if (!ptr) + return OFFLOAD_FAIL; + + *LockedPtr = ptr; + DP("Tgt lock data %ld bytes, (tgt:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + hsa_status_t err = HSA_STATUS_SUCCESS; + + err = unlock_memory(HostPtr); + + if (err != HSA_STATUS_SUCCESS) { + DP("Error in tgt_rtl_data_unlock\n"); + return OFFLOAD_FAIL; + } + + DP("Tgt unlock data (tgt:%016llx).\n", + (long long unsigned)(Elf64_Addr)HostPtr); + return OFFLOAD_SUCCESS; +} + } // extern "C" Index: openmp/libomptarget/src/api.cpp =================================================================== --- openmp/libomptarget/src/api.cpp +++ openmp/libomptarget/src/api.cpp @@ -81,6 +81,15 @@ EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; } EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; } +EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size, + int DeviceNum) { + return targetLockExplicit(Ptr, Size, DeviceNum, __func__); +} + +EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) { + targetUnlockExplicit(Ptr, DeviceNum, __func__); +} + EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", Index: openmp/libomptarget/src/exports =================================================================== --- openmp/libomptarget/src/exports +++ openmp/libomptarget/src/exports @@ -48,6 +48,8 @@ llvm_omp_target_free_shared; llvm_omp_target_free_device; llvm_omp_target_dynamic_shared_alloc; + llvm_omp_target_lock_mem; + llvm_omp_target_unlock_mem; __tgt_set_info_flag; __tgt_print_device_info; omp_get_interop_ptr; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -425,6 +425,80 @@ DP("omp_target_free deallocated device ptr\n"); } +void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, + const char *Name) { + TIMESCOPE(); + DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); + + if (Size <= 0) { + DP("Call to %s with non-positive length\n", Name); + return NULL; + } + + void *rc = NULL; + + if (!deviceIsReady(DeviceNum)) { + DP("%s returns NULL ptr\n", Name); + return NULL; + } + + DeviceTy *DevicePtr = nullptr; + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + if (!PM->Devices[DeviceNum]) { + DP("%s returns, device %d not available\n", Name, DeviceNum); + return nullptr; + } + + DevicePtr = PM->Devices[DeviceNum].get(); + } + + int32_t err = 0; + if (DevicePtr->RTL->data_lock) { + err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc); + if (err) { + DP("Could not lock ptr %p\n", HostPtr); + return nullptr; + } + } + DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc)); + return rc; +} + +void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { + TIMESCOPE(); + DP("Call to %s for device %d unlocking\n", Name, DeviceNum); + + DeviceTy *DevicePtr = nullptr; + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + // Don't check deviceIsReady as it can initialize the device if needed. + // Just check if DeviceNum exists as targetUnlockExplicit can be called + // during process exit/free (and it may have been already destroyed) and + // targetAllocExplicit will have already checked deviceIsReady anyway. + size_t DevicesSize = PM->Devices.size(); + + if (DevicesSize <= (size_t)DeviceNum) { + DP("Device ID %d does not have a matching RTL\n", DeviceNum); + return; + } + + if (!PM->Devices[DeviceNum]) { + DP("%s returns, device %d not available\n", Name, DeviceNum); + return; + } + + DevicePtr = PM->Devices[DeviceNum].get(); + } // unlock RTLsMtx + + if (DevicePtr->RTL->data_unlock) + DevicePtr->RTL->data_unlock(DeviceNum, HostPtr); + + DP("%s returns\n", Name); +} + /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -51,6 +51,10 @@ const char *Name); extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); +extern void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, + const char *Name); +extern void targetUnlockExplicit(void *HostPtr, int DeviceNum, + const char *Name); // This structure stores information of a mapped memory region. struct MapComponentInfoTy { Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -246,6 +246,10 @@ DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); *((void **)&RTL.init_device_info) = DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + *((void **)&RTL.data_lock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock"); + *((void **)&RTL.data_unlock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock"); RTL.LibraryHandler = std::move(DynLibrary); Index: openmp/libomptarget/test/mapping/prelock.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/test/mapping/prelock.cpp @@ -0,0 +1,65 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include + +#include + +extern "C" { +void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num); +void llvm_omp_target_unlock_mem(void *ptr, int device_num); +} + +int main() { + int n = 100; + int *unlocked = new int[n]; + + for (int i = 0; i < n; i++) + unlocked[i] = i; + + int *locked = (int *)llvm_omp_target_lock_mem(unlocked, n * sizeof(int), + omp_get_default_device()); + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n]) + for (int i = 0; i < n; i++) + unlocked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10]) + for (int i = 10; i < 20; i++) + unlocked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : locked[ : n]) + for (int i = 0; i < n; i++) + locked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : locked[10 : 10]) + for (int i = 10; i < 20; i++) + locked[i] += 1; + + llvm_omp_target_unlock_mem(unlocked, omp_get_default_device()); + + int err = 0; + for (int i = 0; i < n; i++) { + if (i < 10 || i > 19) { + if (unlocked[i] != i + 2) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 1); + err++; + } + } else if (unlocked[i] != i + 4) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2); + err++; + } + } + + // CHECK: PASS + if (err == 0) + printf("PASS\n"); + + return err; +} Index: openmp/runtime/src/kmp_alloc.cpp =================================================================== --- openmp/runtime/src/kmp_alloc.cpp +++ openmp/runtime/src/kmp_alloc.cpp @@ -1371,6 +1371,9 @@ kmp_target_alloc_host && kmp_target_alloc_shared && kmp_target_alloc_device && kmp_target_free_host && kmp_target_free_shared && kmp_target_free_device; + // lock/pin and unlock/unpin target calls + *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem"); + *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem"); } omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,