Index: openmp/libomptarget/include/omptargetplugin.h =================================================================== --- openmp/libomptarget/include/omptargetplugin.h +++ openmp/libomptarget/include/omptargetplugin.h @@ -202,6 +202,12 @@ int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr, const char **ErrStr); +// lock/pin host memory +void *__tgt_rtl_data_lock(int DeviceId, void *TgtPtr, int64_t size); + +// unlock/unpin host memory +void __tgt_rtl_data_unlock(int DeviceId, void *TgtPtr); + #ifdef __cplusplus } #endif Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -76,6 +76,8 @@ typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *, const char **); + typedef void *(data_lock_ty)(int32_t, void *, int64_t); + typedef void *(data_unlock_ty)(int32_t, void *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -127,6 +129,8 @@ init_async_info_ty *init_async_info = nullptr; init_device_into_ty *init_device_info = nullptr; release_async_info_ty *release_async_info = nullptr; + data_lock_ty *data_lock = nullptr; + data_unlock_ty *data_unlock = nullptr; // Are there images associated with this RTL. bool IsUsed = false; Index: openmp/libomptarget/plugins/amdgpu/impl/impl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -12,6 +12,34 @@ * Data */ +bool is_locked(void *ptr, hsa_status_t *err_p, void **agentBaseAddress) { + bool is_locked = false; + hsa_status_t err = HSA_STATUS_SUCCESS; + hsa_amd_pointer_info_t info; + info.size = sizeof(hsa_amd_pointer_info_t); + err = hsa_amd_pointer_info(ptr, &info, nullptr, nullptr, nullptr); + + if (err_p) + *err_p = err; + + if (err != HSA_STATUS_SUCCESS) { + DP("Error when getting pointer info\n"); + return false; + } + + is_locked = (info.type == HSA_EXT_POINTER_TYPE_LOCKED); + if (is_locked && agentBaseAddress != nullptr) { + // When user passes in a basePtr+offset we need to fix the + // locked pointer to include the offset: ROCr always returns + // the base locked address, not the shifted one. + *agentBaseAddress = + (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - + (uint64_t)info.hostBaseAddress); + } + + return is_locked; +} + // host pointer (either src or dest) must be locked via hsa_amd_memory_lock static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, hsa_agent_t agent, const void *src, Index: openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -12,6 +12,10 @@ extern "C" { +// Check if pointer ptr is already locked and return true +// if so. Return false otherwise. +bool is_locked(void *ptr, hsa_status_t *err_p, void **agentBaseAddress); + hsa_status_t impl_module_register_from_memory_to_place( void *module_bytes, size_t module_size, int DeviceId, hsa_status_t (*on_deserialized_data)(void *data, size_t size, Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1816,6 +1816,28 @@ return (Rc == 0) && (SI.Addr != nullptr); } +void *lock_memory(void *mem, size_t size) { + void *lockedPtr = nullptr; + hsa_status_t err = HSA_STATUS_SUCCESS; + + if (is_locked(mem, &err, nullptr)) + return mem; + + err = hsa_amd_memory_lock(mem, size, nullptr, /*num_agent=*/0, + (void **)&lockedPtr); + if (err != HSA_STATUS_SUCCESS) + return nullptr; + + return lockedPtr; +} + +hsa_status_t unlock_memory(void *mem) { + hsa_status_t err = HSA_STATUS_SUCCESS; + if (is_locked(mem, &err, nullptr)) + err = hsa_amd_memory_unlock(mem); + return err; +} + } // namespace namespace core { @@ -2589,4 +2611,30 @@ DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); } +void *__tgt_rtl_data_lock(int DeviceId, void *TgtPtr, int64_t size) { + void *ptr = nullptr; + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + + ptr = lock_memory(TgtPtr, size); + + if (ptr != nullptr) + DP("Tgt lock data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + + return ptr; +} + +void __tgt_rtl_data_unlock(int DeviceId, void *TgtPtr) { + assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); + hsa_status_t err = HSA_STATUS_SUCCESS; + + err = unlock_memory(TgtPtr); + + if (err != HSA_STATUS_SUCCESS) + DP("Error in tgt_rtl_data_unlock\n"); + + DP("Tgt unlock data (tgt:%016llx).\n", + (long long unsigned)(Elf64_Addr)TgtPtr); +} + } // extern "C" Index: openmp/libomptarget/src/api.cpp =================================================================== --- openmp/libomptarget/src/api.cpp +++ openmp/libomptarget/src/api.cpp @@ -81,6 +81,14 @@ EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; } EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; } +EXTERN void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num) { + return targetLockExplicit(ptr, size, device_num, __func__); +} + +EXTERN void llvm_omp_target_unlock_mem(void *ptr, int device_num) { + targetUnlockExplicit(ptr, device_num, __func__); +} + EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", Index: openmp/libomptarget/src/exports =================================================================== --- openmp/libomptarget/src/exports +++ openmp/libomptarget/src/exports @@ -48,6 +48,8 @@ llvm_omp_target_free_shared; llvm_omp_target_free_device; llvm_omp_target_dynamic_shared_alloc; + llvm_omp_target_lock_mem; + llvm_omp_target_unlock_mem; __tgt_set_info_flag; __tgt_print_device_info; omp_get_interop_ptr; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -425,6 +425,66 @@ DP("omp_target_free deallocated device ptr\n"); } +void *targetLockExplicit(void *ptr, size_t size, int DeviceNum, + const char *name) { + TIMESCOPE(); + DP("Call to %s for device %d locking %zu bytes\n", name, DeviceNum, size); + + if (size <= 0) { + DP("Call to %s with non-positive length\n", name); + return NULL; + } + + void *rc = NULL; + + if (!deviceIsReady(DeviceNum)) { + DP("%s returns NULL ptr\n", name); + return NULL; + } + + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + DeviceTy &Device = *PM->Devices[DeviceNum]; + if (Device.RTL->data_lock) + rc = Device.RTL->data_lock(DeviceNum, ptr, size); + } + + DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc)); + return rc; +} + +void targetUnlockExplicit(void *ptr, int DeviceNum, const char *name) { + TIMESCOPE(); + DP("Call to %s for device %d unlocking\n", name, DeviceNum); + { + std::lock_guardRTLsMtx)> LG(PM->RTLsMtx); + + // Don't check deviceIsReady as it can initialize the device if needed. + // Just check if DeviceNum exists as targetUnlockExplicit can be called + // during process exit/free (and it may have been already destroyed) and + // targetAllocExplicit will have already checked deviceIsReady anyway. + size_t DevicesSize = PM->Devices.size(); + + if (DevicesSize <= (size_t)DeviceNum) { + DP("Device ID %d does not have a matching RTL\n", DeviceNum); + return; + } + + if (!PM->Devices[DeviceNum]) { + DP("%s returns, device %d not available\n", name, DeviceNum); + return; + } + + DeviceTy &Device = *PM->Devices[DeviceNum]; + + if (Device.RTL->data_unlock) + Device.RTL->data_unlock(DeviceNum, ptr); + } // unlock RTLsMtx + + DP("%s returns\n", name); +} + /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -51,6 +51,9 @@ const char *Name); extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); +extern void *targetLockExplicit(void *ptr, size_t size, int device_num, + const char *name); +extern void targetUnlockExplicit(void *ptr, int device_num, const char *name); // This structure stores information of a mapped memory region. struct MapComponentInfoTy { Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -246,6 +246,10 @@ DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); *((void **)&RTL.init_device_info) = DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + *((void **)&RTL.data_lock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock"); + *((void **)&RTL.data_unlock) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock"); RTL.LibraryHandler = std::move(DynLibrary); Index: openmp/libomptarget/test/mapping/prelock.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/test/mapping/prelock.cpp @@ -0,0 +1,58 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include + +#include + +extern "C" { +void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num); +void llvm_omp_target_unlock_mem(void *ptr, int device_num); +} + +int main() { + int n = 100; + int *unlocked = new int[n]; + + for (int i = 0; i < n; i++) + unlocked[i] = i; + + int *locked = nullptr; + + llvm_omp_target_lock_mem(locked, n * sizeof(int), omp_get_default_device()); + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n]) + for (int i = 0; i < n; i++) + unlocked[i] += 1; + +#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10]) + for (int i = 10; i < 20; i++) + unlocked[i] += 1; + + llvm_omp_target_unlock_mem(locked, omp_get_default_device()); + + int err = 0; + for (int i = 0; i < n; i++) { + if (i < 10 || i > 19) { + if (unlocked[i] != i + 1) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 1); + err++; + } + } else if (unlocked[i] != i + 2) { + printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2); + err++; + } + } + + // CHECK: PASS + if (err == 0) + printf("PASS\n"); + + return err; +} Index: openmp/runtime/src/kmp_alloc.cpp =================================================================== --- openmp/runtime/src/kmp_alloc.cpp +++ openmp/runtime/src/kmp_alloc.cpp @@ -1371,6 +1371,9 @@ kmp_target_alloc_host && kmp_target_alloc_shared && kmp_target_alloc_device && kmp_target_free_host && kmp_target_free_shared && kmp_target_free_device; + // lock/pin and unlock/unpin target calls + *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem"); + *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem"); } omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,