diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt --- a/openmp/libomptarget/src/CMakeLists.txt +++ b/openmp/libomptarget/src/CMakeLists.txt @@ -19,6 +19,7 @@ MemoryManager.cpp rtl.cpp omptarget.cpp + offloading.cpp ) # Build libomptarget library with libdl dependency. diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -11,287 +11,52 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "offloading.h" #include "private.h" -#include "rtl.h" #include #include #include -EXTERN int omp_get_num_devices(void) { - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); +extern OffloadingPlugin *OP; - DP("Call to omp_get_num_devices returning %zd\n", Devices_size); +EXTERN int omp_get_num_devices(void) { return OP->omp_get_num_devices(); } - return Devices_size; -} - -EXTERN int omp_get_initial_device(void) { - int hostDevice = omp_get_num_devices(); - DP("Call to omp_get_initial_device returning %d\n", hostDevice); - return hostDevice; -} +EXTERN int omp_get_initial_device(void) { return OP->omp_get_initial_device(); } EXTERN void *omp_target_alloc(size_t size, int device_num) { - DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", - device_num, size); - - if (size <= 0) { - DP("Call to omp_target_alloc with non-positive length\n"); - return NULL; - } - - void *rc = NULL; - - if (device_num == omp_get_initial_device()) { - rc = malloc(size); - DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_alloc returns NULL ptr\n"); - return NULL; - } - - rc = Devices[device_num].allocData(size); - DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; + return OP->omp_target_alloc(size, device_num); } EXTERN void 
omp_target_free(void *device_ptr, int device_num) { - DP("Call to omp_target_free for device %d and address " DPxMOD "\n", - device_num, DPxPTR(device_ptr)); - - if (!device_ptr) { - DP("Call to omp_target_free with NULL ptr\n"); - return; - } - - if (device_num == omp_get_initial_device()) { - free(device_ptr); - DP("omp_target_free deallocated host ptr\n"); - return; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_free returns, nothing to do\n"); - return; - } - - Devices[device_num].deleteData(device_ptr); - DP("omp_target_free deallocated device ptr\n"); + OP->omp_target_free(device_ptr, device_num); } EXTERN int omp_target_is_present(void *ptr, int device_num) { - DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", - device_num, DPxPTR(ptr)); - - if (!ptr) { - DP("Call to omp_target_is_present with NULL ptr, returning false\n"); - return false; - } - - if (device_num == omp_get_initial_device()) { - DP("Call to omp_target_is_present on host, returning true\n"); - return true; - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Call to omp_target_is_present with invalid device ID, returning " - "false\n"); - return false; - } - - DeviceTy& Device = Devices[device_num]; - bool IsLast; // not used - bool IsHostPtr; - void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); - int rc = (TgtPtr != NULL); - // Under unified memory the host pointer can be returned by the - // getTgtPtrBegin() function which means that there is no device - // corresponding point for ptr. This function should return false - // in that situation. 
- if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) - rc = !IsHostPtr; - DP("Call to omp_target_is_present returns %d\n", rc); - return rc; + return OP->omp_target_is_present(ptr, device_num); } EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, size_t src_offset, int dst_device, int src_device) { - DP("Call to omp_target_memcpy, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " - "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), - DPxPTR(src), dst_offset, src_offset, length); - - if (!dst || !src || length <= 0) { - REPORT("Call to omp_target_memcpy with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { - REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { - REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - int rc = OFFLOAD_SUCCESS; - void *srcAddr = (char *)src + src_offset; - void *dstAddr = (char *)dst + dst_offset; - - if (src_device == omp_get_initial_device() && - dst_device == omp_get_initial_device()) { - DP("copy from host to host\n"); - const void *p = memcpy(dstAddr, srcAddr, length); - if (p == NULL) - rc = OFFLOAD_FAIL; - } else if (src_device == omp_get_initial_device()) { - DP("copy from host to device\n"); - DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr); - } else if (dst_device == omp_get_initial_device()) { - DP("copy from device to host\n"); - DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr); - } else { - DP("copy from device to device\n"); - DeviceTy &SrcDev = Devices[src_device]; - DeviceTy &DstDev = Devices[dst_device]; - // First try to use D2D memcpy which is more efficient. 
If fails, fall back - // to unefficient way. - if (SrcDev.isDataExchangable(DstDev)) { - rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr); - if (rc == OFFLOAD_SUCCESS) - return OFFLOAD_SUCCESS; - } - - void *buffer = malloc(length); - rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr); - if (rc == OFFLOAD_SUCCESS) - rc = DstDev.submitData(dstAddr, buffer, length, nullptr); - free(buffer); - } - - DP("omp_target_memcpy returns %d\n", rc); - return rc; + return OP->omp_target_memcpy(dst, src, length, dst_offset, src_offset, + dst_device, src_device); } EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, int num_dims, const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device, int src_device) { - DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " - "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " - "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, - src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), - DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), - DPxPTR(volume), element_size, num_dims); - - if (!(dst || src)) { - DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", - INT_MAX); - return INT_MAX; - } - - if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || - !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { - REPORT("Call to omp_target_memcpy_rect with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - int rc; - if (num_dims == 1) { - rc = omp_target_memcpy(dst, src, element_size * volume[0], - element_size * dst_offsets[0], element_size * src_offsets[0], - dst_device, src_device); - } else { - size_t dst_slice_size = element_size; - size_t src_slice_size = element_size; - for (int i=1; iomp_target_memcpy_rect(dst, 
src, element_size, num_dims, volume, + dst_offsets, src_offsets, dst_dimensions, + src_dimensions, dst_device, src_device); } EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, size_t device_offset, int device_num) { - DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", " - "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n", - DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num); - - if (!host_ptr || !device_ptr || size <= 0) { - REPORT("Call to omp_target_associate_ptr with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - if (device_num == omp_get_initial_device()) { - REPORT("omp_target_associate_ptr: no association possible on the host\n"); - return OFFLOAD_FAIL; - } - - if (!device_is_ready(device_num)) { - REPORT("omp_target_associate_ptr returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - DeviceTy& Device = Devices[device_num]; - void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset); - int rc = Device.associatePtr(host_ptr, device_addr, size); - DP("omp_target_associate_ptr returns %d\n", rc); - return rc; + return OP->omp_target_associate_ptr(host_ptr, device_ptr, size, device_offset, + device_num); } EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) { - DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", " - "device_num %d\n", DPxPTR(host_ptr), device_num); - - if (!host_ptr) { - REPORT("Call to omp_target_associate_ptr with invalid host_ptr\n"); - return OFFLOAD_FAIL; - } - - if (device_num == omp_get_initial_device()) { - REPORT( - "omp_target_disassociate_ptr: no association possible on the host\n"); - return OFFLOAD_FAIL; - } - - if (!device_is_ready(device_num)) { - REPORT("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - DeviceTy& Device = Devices[device_num]; - int rc = Device.disassociatePtr(host_ptr); - DP("omp_target_disassociate_ptr returns %d\n", rc); - return rc; 
+ return OP->omp_target_disassociate_ptr(host_ptr, device_num); } diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -165,18 +165,18 @@ void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier, - bool HasPresentModifier); + bool HasPresentModifier, int64_t RequiresFlags); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount, bool &IsHostPtr, - bool MustContain = false); + int64_t RequiresFlags, bool MustContain = false); int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier = false); + int64_t RequiresFlags, bool HasCloseModifier = false); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); // calls to RTL - int32_t initOnce(); + int32_t initOnce(int64_t RequiresFlags); __tgt_target_table *load_binary(void *Img); // device memory allocation/deallocation routines @@ -216,13 +216,11 @@ private: // Call to RTL - void init(); // To be called only via DeviceTy::initOnce() + void + init(int64_t RequiresFlags); // To be called only via DeviceTy::initOnce() }; /// Map between Device ID (i.e. openmp device id) and its DeviceTy. typedef std::vector DevicesTy; -extern DevicesTy Devices; - -extern bool device_is_ready(int device_num); #endif diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -20,9 +20,6 @@ #include #include -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. 
-DevicesTy Devices; - DeviceTy::DeviceTy(const DeviceTy &D) : DeviceID(D.DeviceID), RTL(D.RTL), RTLDeviceID(D.RTLDeviceID), IsInit(D.IsInit), InitFlag(), HasPendingGlobals(D.HasPendingGlobals), @@ -199,8 +196,8 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount, - bool HasCloseModifier, - bool HasPresentModifier) { + bool HasCloseModifier, bool HasPresentModifier, + int64_t RequiresFlags) { void *rc = NULL; IsHostPtr = false; IsNew = false; @@ -239,7 +236,7 @@ MESSAGE("device mapping required by 'present' map type modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", DPxPTR(HstPtrBegin), Size); - } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + } else if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) { // If unified shared memory is active, implicitly mapped variables that are // not privatized use host address. Any explicitly mapped variables also use @@ -284,7 +281,7 @@ // Decrement the reference counter if called from targetDataEnd. void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount, bool &IsHostPtr, - bool MustContain) { + int64_t RequiresFlags, bool MustContain) { void *rc = NULL; IsHostPtr = false; IsLast = false; @@ -305,7 +302,7 @@ Size, (UpdateRefCount ? " updated" : ""), HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); rc = (void *)tp; - } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + } else if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { // If the value isn't found in the mapping and unified shared memory // is on then it means we have stumbled upon a value which we need to // use directly from the host. 
@@ -334,8 +331,8 @@ } int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier) { - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) + int64_t RequiresFlags, bool HasCloseModifier) { + if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) return OFFLOAD_SUCCESS; // Check if the pointer is contained in any sub-nodes. int rc; @@ -367,10 +364,10 @@ } /// Init device, should not be called directly. -void DeviceTy::init() { +void DeviceTy::init(int64_t RequiresFlags) { // Make call to init_requires if it exists for this plugin. if (RTL->init_requires) - RTL->init_requires(RTLs->RequiresFlags); + RTL->init_requires(RequiresFlags); int32_t Ret = RTL->init_device(RTLDeviceID); if (Ret != OFFLOAD_SUCCESS) return; @@ -389,8 +386,8 @@ } /// Thread-safe method to initialize the device only once. -int32_t DeviceTy::initOnce() { - std::call_once(InitFlag, &DeviceTy::init, this); +int32_t DeviceTy::initOnce(int64_t RequiresFlags) { + std::call_once(InitFlag, &DeviceTy::init, this, RequiresFlags); // At this point, if IsInit is true, then either this thread or some other // thread in the past successfully initialized the device, so we can return @@ -506,33 +503,3 @@ return OFFLOAD_SUCCESS; } -/// Check whether a device has an associated RTL and initialize it if it's not -/// already initialized. -bool device_is_ready(int device_num) { - DP("Checking whether device %d is ready.\n", device_num); - // Devices.size() can only change while registering a new - // library, so try to acquire the lock of RTLs' mutex. - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Device ID %d does not have a matching RTL\n", device_num); - return false; - } - - // Get device info - DeviceTy &Device = Devices[device_num]; - - DP("Is the device %d (local ID %d) initialized? 
%d\n", device_num, - Device.RTLDeviceID, Device.IsInit); - - // Init the device if not done before - if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { - DP("Failed to init device %d\n", device_num); - return false; - } - - DP("Device %d is ready to use.\n", device_num); - - return true; -} diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -11,436 +11,233 @@ // //===----------------------------------------------------------------------===// -#include "device.h" -#include "private.h" -#include "rtl.h" +#include "offloading.h" #include #include #include #include -// Store target policy (disabled, mandatory, default) -kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; -std::mutex TargetOffloadMtx; - -//////////////////////////////////////////////////////////////////////////////// -/// manage the success or failure of a target construct -static void HandleDefaultTargetOffload() { - TargetOffloadMtx.lock(); - if (TargetOffloadPolicy == tgt_default) { - if (omp_get_num_devices() > 0) { - DP("Default TARGET OFFLOAD policy is now mandatory " - "(devices were found)\n"); - TargetOffloadPolicy = tgt_mandatory; - } else { - DP("Default TARGET OFFLOAD policy is now disabled " - "(no devices were found)\n"); - TargetOffloadPolicy = tgt_disabled; - } - } - TargetOffloadMtx.unlock(); -} - -static int IsOffloadDisabled() { - if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); - return TargetOffloadPolicy == tgt_disabled; -} - -static void HandleTargetOutcome(bool success) { - switch (TargetOffloadPolicy) { - case tgt_disabled: - if (success) { - FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); - } - break; - case tgt_default: - FATAL_MESSAGE0(1, "default offloading policy must be switched to " - "mandatory or disabled"); - break; - case tgt_mandatory: - if (!success) { - if 
(getInfoLevel() > 1) - for (const auto &Device : Devices) - dumpTargetPointerMappings(Device); - else - FAILURE_MESSAGE("run with env LIBOMPTARGET_INFO>1 to dump host-target" - "pointer maps\n"); - - FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); - } - break; - } -} +extern OffloadingPlugin *OP; //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t flags) { - RTLs->RegisterRequires(flags); + OP->RegisterRequires(flags); } //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image -EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { - RTLs->RegisterLib(desc); -} +EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { OP->RegisterLib(desc); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { - RTLs->UnregisterLib(desc); + OP->UnregisterLib(desc); } /// creates host-to-target data mapping, stores it in the /// libomptarget.so internal structure (an entry in a stack of data maps) /// and passes the data to the device. 
EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - __tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types) { + OP->__tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); } EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + return OP->__tgt_target_data_begin_nowait( + device_id, arg_num, args_base, args, arg_sizes, arg_types, depNum, + depList, noAliasDepNum, noAliasDepList); } EXTERN void __tgt_target_data_begin_mapper(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers) { - if (IsOffloadDisabled()) return; - - DP("Entering data begin region for device %" PRId64 " with %d mappings\n", - device_id, arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - DP("Use default device id %" PRId64 "\n", device_id); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - -#ifdef OMPTARGET_DEBUG - for (int i = 0; i < arg_num; ++i) { - DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 "\n", - i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); - } -#endif - - int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers, nullptr); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types, + void **arg_mappers) { + OP->__tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); } -EXTERN void __tgt_target_data_begin_nowait_mapper(int64_t device_id, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, arg_mappers); +EXTERN void __tgt_target_data_begin_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + OP->__tgt_target_data_begin_nowait_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, arg_mappers, + depNum, depList, noAliasDepNum, noAliasDepList); } /// passes data from the target, releases target memory and destroys /// the host-target mapping (top entry from the stack of 
data maps) /// created by the last __tgt_target_data_begin. EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - __tgt_target_data_end_mapper(device_id, arg_num, args_base, args, arg_sizes, - arg_types, nullptr); + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types) { + OP->__tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); } EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_end_mapper(device_id, arg_num, args_base, args, arg_sizes, - arg_types, nullptr); + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + OP->__tgt_target_data_end_nowait(device_id, arg_num, args_base, args, + arg_sizes, arg_types, depNum, depList, + noAliasDepNum, noAliasDepList); } EXTERN void __tgt_target_data_end_mapper(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers) { - if (IsOffloadDisabled()) return; - DP("Entering data end region with %d mappings\n", arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_id) { - DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - if (!Device.IsInit) { - DP("Uninit device: ignore"); - HandleTargetOutcome(false); - return; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i__tgt_target_data_end_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); } -EXTERN void __tgt_target_data_end_nowait_mapper(int64_t device_id, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_end_mapper(device_id, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers); +EXTERN void __tgt_target_data_end_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + OP->__tgt_target_data_end_nowait_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, arg_mappers, + depNum, depList, noAliasDepNum, noAliasDepList); } EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types) { + OP->__tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); } -EXTERN void __tgt_target_data_update_nowait(int64_t 
device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); +EXTERN void __tgt_target_data_update_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + OP->__tgt_target_data_update_nowait(device_id, arg_num, args_base, args, + arg_sizes, arg_types, depNum, depList, + noAliasDepNum, noAliasDepList); } EXTERN void __tgt_target_data_update_mapper(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers) { - if (IsOffloadDisabled()) return; - DP("Entering data update with %d mappings\n", arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy& Device = Devices[device_id]; - int rc = target_data_update(Device, arg_num, args_base, - args, arg_sizes, arg_types, arg_mappers); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types, + void **arg_mappers) { + OP->__tgt_target_data_update_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); } -EXTERN void __tgt_target_data_update_nowait_mapper(int64_t device_id, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, - arg_sizes, arg_types, arg_mappers); +EXTERN void __tgt_target_data_update_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + OP->__tgt_target_data_update_nowait_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, arg_mappers, + depNum, depList, noAliasDepNum, noAliasDepList); } EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types) { + return OP->__tgt_target(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, 
arg_types); } EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, - arg_sizes, arg_types, nullptr); + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + return OP->__tgt_target_nowait(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, depNum, depList, + noAliasDepNum, noAliasDepList); } EXTERN int __tgt_target_mapper(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - REPORT("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i__tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); } EXTERN int __tgt_target_nowait_mapper(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, 
- arg_sizes, arg_types, arg_mappers); + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, void **arg_mappers, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + return OP->__tgt_target_nowait_mapper( + device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, + arg_mappers, depNum, depList, noAliasDepNum, noAliasDepList); } EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t team_num, int32_t thread_limit) { - return __tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, - args, arg_sizes, arg_types, nullptr, team_num, thread_limit); + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit) { + return OP->__tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); } - EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum, - void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, - args, arg_sizes, arg_types, nullptr, team_num, thread_limit); + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, + int32_t thread_limit, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + return OP->__tgt_target_teams_nowait( + device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, + team_num, thread_limit, depNum, depList, noAliasDepNum, noAliasDepList); } EXTERN int __tgt_target_teams_mapper(int64_t 
device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t team_num, int32_t thread_limit) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - REPORT("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, - args, arg_sizes, arg_types, arg_mappers, team_num, thread_limit); + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, void **arg_mappers, + int32_t team_num, int32_t thread_limit) { + return OP->__tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, arg_mappers, + team_num, thread_limit); +} + +EXTERN int __tgt_target_teams_nowait_mapper( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + return OP->__tgt_target_teams_nowait_mapper( + device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, + arg_mappers, team_num, thread_limit, depNum, depList, noAliasDepNum, + noAliasDepList); } // Get the current number of components for a user-defined mapper. 
EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - int64_t size = MapperComponentsPtr->Components.size(); - DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", - DPxPTR(rt_mapper_handle), size); - return size; + return OP->__tgt_mapper_num_components(rt_mapper_handle); } // Push back one component for a user-defined mapper. EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, void *begin, int64_t size, int64_t type) { - DP("__tgt_push_mapper_component(Handle=" DPxMOD - ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ").\n", - DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - MapperComponentsPtr->Components.push_back( - MapComponentInfoTy(base, begin, size, type)); + return OP->__tgt_push_mapper_component(rt_mapper_handle, base, begin, size, + type); } EXTERN void __kmpc_push_target_tripcount(int64_t device_id, - uint64_t loop_tripcount) { - if (IsOffloadDisabled()) - return; - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, - loop_tripcount); - TblMapMtx->lock(); - Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), - loop_tripcount); - TblMapMtx->unlock(); + uint64_t loop_tripcount) { + OP->__kmpc_push_target_tripcount(device_id, loop_tripcount); } diff --git a/openmp/libomptarget/src/offloading.h b/openmp/libomptarget/src/offloading.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/offloading.h @@ -0,0 +1,274 @@ +#include "device.h" +#include "private.h" +#include "rtl.h" 
+ +#include +#include +#include +#include + +class OffloadingPlugin { +public: + DevicesTy Devices; + std::mutex *RTLsMtx; + + HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; + std::mutex *TrlTblMtx; + + HostPtrToTableMapTy *HostPtrToTableMap; + std::mutex *TblMapMtx; + + std::mutex *TargetOffloadMtx; + + // FIXME: this needs to be set by the constructor + int64_t host_device = -10; + + // Mutex-like object to guarantee thread-safety and unique initialization + // (i.e. the library attempts to load the RTLs (plugins) only once). + std::once_flag initFlag; + void LoadRTLs(); // not thread-safe + + // Store target policy (disabled, mandatory, default) + kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; + + // List of the detected runtime libraries. + std::vector AllRTLs; + + // Array of pointers to the detected runtime libraries that have compatible + // binaries. + std::vector UsedRTLs; + + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + // Register the clauses of the requires directive. + void RegisterRequires(int64_t flags); + + // Register a shared library with all (compatible) RTLs. + void RegisterLib(__tgt_bin_desc *desc); + + // Unregister a shared library from all RTLs. 
+ void UnregisterLib(__tgt_bin_desc *desc); + + OffloadingPlugin() { + RTLsMtx = new std::mutex(); + + HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); + TrlTblMtx = new std::mutex(); + + HostPtrToTableMap = new HostPtrToTableMapTy(); + TblMapMtx = new std::mutex(); + + TargetOffloadMtx = new std::mutex(); + }; + + ~OffloadingPlugin() { + delete RTLsMtx; + + delete HostEntriesBeginToTransTable; + delete TrlTblMtx; + + delete HostPtrToTableMap; + delete TblMapMtx; + + delete TargetOffloadMtx; + } + + //////////////////////////////////////////////////////////////////////////////// + /// manage the success or failure of a target construct + void HandleDefaultTargetOffload(); + int IsOffloadDisabled(); + void HandleTargetOutcome(bool success); + + //////////////////////////////////////////////////////////////////////////////// + // OMP API + //////////////////////////////////////////////////////////////////////////////// + + int omp_get_num_devices(void); + int omp_get_initial_device(void); + void *omp_target_alloc(size_t size, int device_num); + void omp_target_free(void *device_ptr, int device_num); + int omp_target_is_present(void *ptr, int device_num); + int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, + size_t src_offset, int dst_device, int src_device); + int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, + int src_device); + int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, + size_t device_offset, int device_num); + int omp_target_disassociate_ptr(void *host_ptr, int device_num); + + //////////////////////////////////////////////////////////////////////////////// + // Function Implementations + //////////////////////////////////////////////////////////////////////////////// + + /// creates 
host-to-target data mapping, stores it in the + /// libomptarget.so internal structure (an entry in a stack of data maps) + /// and passes the data to the device. + void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + + void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + + void __tgt_target_data_begin_mapper(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers); + + void __tgt_target_data_begin_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList); + + /// passes data from the target, releases target memory and destroys + /// the host-target mapping (top entry from the stack of data maps) + /// created by the last __tgt_target_data_begin. 
+ void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); + + void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + + void __tgt_target_data_end_mapper(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers); + + void __tgt_target_data_end_nowait_mapper(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types, + void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList); + + void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + + void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + + void __tgt_target_data_update_mapper(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers); + + void __tgt_target_data_update_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList); + + int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); + + int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void 
*noAliasDepList); + + int __tgt_target_mapper(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, void **arg_mappers); + + int __tgt_target_nowait_mapper(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList); + + int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, + int32_t thread_limit); + + int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + + int __tgt_target_teams_mapper(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, int32_t team_num, + int32_t thread_limit); + + int __tgt_target_teams_nowait_mapper( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + + // Get the current number of components for a user-defined mapper. + int64_t __tgt_mapper_num_components(void *rt_mapper_handle); + + // Push back one component for a user-defined mapper. 
+ void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, + void *begin, int64_t size, int64_t type); + + void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); + + //////////////////////////////////////////////////////////////////////////// + // Devices + //////////////////////////////////////////////////////////////////////////// + + /// Check whether a device has an associated RTL and initialize it if it's not + /// already initialized. + bool device_is_ready(int device_num); + + int CheckDeviceAndCtors(int64_t device_id); + + /// Map global data and execute pending ctors + int InitLibrary(DeviceTy &Device); + + void RegisterImageIntoTranslationTable(TranslationTable &TT, RTLInfoTy &RTL, + __tgt_device_image *image); + + void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, + RTLInfoTy *RTL); +}; + +int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, OffloadingPlugin *OP, + __tgt_async_info *async_info_ptr); + +int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + void **ArgMappers, OffloadingPlugin *OP, + __tgt_async_info *AsyncInfo); + +int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, OffloadingPlugin *OP, + __tgt_async_info *async_info_ptr = nullptr); + +int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers, + int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct, + OffloadingPlugin *OP); diff --git a/openmp/libomptarget/src/offloading.cpp b/openmp/libomptarget/src/offloading.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/offloading.cpp @@ -0,0 +1,1295 @@ +#include "offloading.h" + +// List of all plugins 
that can support offloading. +static const char *RTLNames[] = { + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so", + /* AArch64 target */ "libomptarget.rtl.aarch64.so", + /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", + /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", + /* Remote target */ "libomptarget.rtl.rpc.so"}; + +//////////////////////////////////////////////////////////////////////////////// +/// manage the success or failure of a target construct +void OffloadingPlugin::HandleDefaultTargetOffload() { + TargetOffloadMtx->lock(); + if (TargetOffloadPolicy == tgt_default) { + if (omp_get_num_devices() > 0) { + DP("Default TARGET OFFLOAD policy is now mandatory " + "(devices were found)\n"); + TargetOffloadPolicy = tgt_mandatory; + } else { + DP("Default TARGET OFFLOAD policy is now disabled " + "(no devices were found)\n"); + TargetOffloadPolicy = tgt_disabled; + } + } + TargetOffloadMtx->unlock(); +} + +int OffloadingPlugin::IsOffloadDisabled() { + if (TargetOffloadPolicy == tgt_default) + HandleDefaultTargetOffload(); + return TargetOffloadPolicy == tgt_disabled; +} + +void OffloadingPlugin::HandleTargetOutcome(bool success) { + switch (TargetOffloadPolicy) { + case tgt_disabled: + if (success) { + FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); + } + break; + case tgt_default: + FATAL_MESSAGE0(1, "default offloading policy must be switched to " + "mandatory or disabled"); + break; + case tgt_mandatory: + if (!success) { + if (getInfoLevel() > 1) + for (const auto &Device : Devices) + dumpTargetPointerMappings(Device); + else + FAILURE_MESSAGE("run with env LIBOMPTARGET_INFO>1 to dump host-target" + " pointer maps\n"); + + FATAL_MESSAGE0( + 1, "failure of target construct while offloading is mandatory"); + } + break; + } +} + +/// creates host-to-target data mapping, stores it in the +/// libomptarget.so internal structure (an 
entry in a stack of data maps) +/// and passes the data to the device. +void OffloadingPlugin::__tgt_target_data_begin(int64_t device_id, + int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types) { + __tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, arg_sizes, + arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_begin_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + OffloadingPlugin::__tgt_target_data_begin_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_begin_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers) { + if (IsOffloadDisabled()) + return; + + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %" PRId64 "\n", device_id); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers, this, nullptr); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +void OffloadingPlugin::__tgt_target_data_begin_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_begin_mapper(device_id, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers); +} + +/// passes data from the target, releases target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin. 
+void OffloadingPlugin::__tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types) { + __tgt_target_data_end_mapper(device_id, arg_num, args_base, args, arg_sizes, + arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_end_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + OffloadingPlugin::__tgt_target_data_end_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_end_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers) { + if (IsOffloadDisabled()) + return; + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_id) { + DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + if (!Device.IsInit) { + DP("Uninit device: ignore"); + HandleTargetOutcome(false); + return; + } + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = targetDataEnd(Device, arg_num, args_base, args, arg_sizes, arg_types, + arg_mappers, this, nullptr); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +void OffloadingPlugin::__tgt_target_data_end_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + OffloadingPlugin::__tgt_target_data_end_mapper( + device_id, arg_num, args_base, args, arg_sizes, arg_types, arg_mappers); +} + +void OffloadingPlugin::__tgt_target_data_update(int64_t device_id, + int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, + int64_t *arg_types) { + __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_update_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 
__kmpc_global_thread_num(NULL)); + + __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr); +} + +void OffloadingPlugin::__tgt_target_data_update_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers) { + if (IsOffloadDisabled()) + return; + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + int rc = target_data_update(Device, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers, this, nullptr); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +void OffloadingPlugin::__tgt_target_data_update_nowait_mapper( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_update_mapper(device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); +} + +int OffloadingPlugin::__tgt_target(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types) { + return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, nullptr); +} + +int OffloadingPlugin::__tgt_target_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + 
__kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, nullptr); +} + +int OffloadingPlugin::__tgt_target_mapper(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + void **arg_mappers) { + if (IsOffloadDisabled()) + return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 + "\n", + DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + REPORT("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers, 0, 0, false /*team*/, this); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + return rc; +} + +int OffloadingPlugin::__tgt_target_nowait_mapper( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_mapper(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, arg_mappers); +} + +int OffloadingPlugin::__tgt_target_teams(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, + int32_t thread_limit) { + return 
__tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, nullptr, + team_num, thread_limit); +} + +int OffloadingPlugin::__tgt_target_teams_nowait( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t team_num, + int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, nullptr, + team_num, thread_limit); +} + +int OffloadingPlugin::__tgt_target_teams_mapper( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, + int32_t team_num, int32_t thread_limit) { + if (IsOffloadDisabled()) + return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 + "\n", + DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + REPORT("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers, team_num, thread_limit, true /*team*/, + this); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + + return rc; +} + +int OffloadingPlugin::__tgt_target_teams_nowait_mapper( + int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, 
int64_t *arg_types, void **arg_mappers, + int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_teams_mapper(device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, arg_mappers, + team_num, thread_limit); +} + +// Get the current number of components for a user-defined mapper. +int64_t OffloadingPlugin::__tgt_mapper_num_components(void *rt_mapper_handle) { + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + int64_t size = MapperComponentsPtr->Components.size(); + DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", + DPxPTR(rt_mapper_handle), size); + return size; +} + +// Push back one component for a user-defined mapper. +void OffloadingPlugin::__tgt_push_mapper_component(void *rt_mapper_handle, + void *base, void *begin, + int64_t size, int64_t type) { + DP("__tgt_push_mapper_component(Handle=" DPxMOD + ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 ").\n", + DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + MapperComponentsPtr->Components.push_back( + MapComponentInfoTy(base, begin, size, type)); +} + +void OffloadingPlugin::__kmpc_push_target_tripcount(int64_t device_id, + uint64_t loop_tripcount) { + if (IsOffloadDisabled()) + return; + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, + loop_tripcount); + TblMapMtx->lock(); + Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), + 
loop_tripcount); + TblMapMtx->unlock(); +} + +bool OffloadingPlugin::device_is_ready(int device_num) { + DP("Checking whether device %d is ready.\n", device_num); + // Devices.size() can only change while registering a new + // library, so try to acquire the lock of RTLs' mutex. + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Device ID %d does not have a matching RTL\n", device_num); + return false; + } + + // Get device info + DeviceTy &Device = Devices[device_num]; + + DP("Is the device %d (local ID %d) initialized? %d\n", device_num, + Device.RTLDeviceID, Device.IsInit); + + // Init the device if not done before + if (!Device.IsInit && Device.initOnce(RequiresFlags) != OFFLOAD_SUCCESS) { + DP("Failed to init device %d\n", device_num); + return false; + } + + DP("Device %d is ready to use.\n", device_num); + + return true; +} + +void OffloadingPlugin::LoadRTLs() { + // Parse environment variable OMP_TARGET_OFFLOAD (if set) + TargetOffloadPolicy = (kmp_target_offload_kind_t)__kmpc_get_target_offload(); + if (TargetOffloadPolicy == tgt_disabled) { + return; + } + + DP("Loading RTLs...\n"); + + // Attempt to open all the plugins and, if they exist, check if the interface + // is correct and if they are supporting any devices. + for (auto *Name : RTLNames) { + DP("Loading library '%s'...\n", Name); + void *dynlib_handle = dlopen(Name, RTLD_NOW); + + if (!dynlib_handle) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, dlerror()); + continue; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Retrieve the RTL information from the runtime library. 
+ RTLInfoTy R; + + R.LibraryHandler = dynlib_handle; + R.isUsed = false; + +#ifdef OMPTARGET_DEBUG + R.RTLName = Name; +#endif + + if (!(*((void **)&R.is_valid_binary) = + dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) + continue; + if (!(*((void **)&R.number_of_devices) = + dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) + continue; + if (!(*((void **)&R.init_device) = + dlsym(dynlib_handle, "__tgt_rtl_init_device"))) + continue; + if (!(*((void **)&R.load_binary) = + dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) + continue; + if (!(*((void **)&R.data_alloc) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) + continue; + if (!(*((void **)&R.data_submit) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) + continue; + if (!(*((void **)&R.data_retrieve) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) + continue; + if (!(*((void **)&R.data_delete) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) + continue; + if (!(*((void **)&R.run_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) + continue; + if (!(*((void **)&R.run_team_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) + continue; + + // Optional functions + *((void **)&R.init_requires) = + dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.run_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); + *((void **)&R.run_team_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + *((void **)&R.data_exchange) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange"); + *((void **)&R.data_exchange_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async"); + *((void **)&R.is_data_exchangable) = + dlsym(dynlib_handle, 
"__tgt_rtl_is_data_exchangable"); + + // No devices are supported by this RTL? + if (!(R.NumberOfDevices = R.number_of_devices())) { + DP("No devices supported in this RTL\n"); + continue; + } + + DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), + R.NumberOfDevices); + + // The RTL is valid! Will save the information in the RTLs list. + AllRTLs.push_back(R); + } + + DP("RTLs loaded!\n"); + + return; +} + +void OffloadingPlugin::RegisterRequires(int64_t flags) { + // TODO: add more elaborate check. + // Minimal check: only set requires flags if previous value + // is undefined. This ensures that only the first call to this + // function will set the requires flags. All subsequent calls + // will be checked for compatibility. + assert(flags != OMP_REQ_UNDEFINED && + "illegal undefined flag for requires directive!"); + if (RequiresFlags == OMP_REQ_UNDEFINED) { + RequiresFlags = flags; + return; + } + + // If multiple compilation units are present enforce + // consistency across all of them for require clauses: + // - reverse_offload + // - unified_address + // - unified_shared_memory + if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != + (flags & OMP_REQ_REVERSE_OFFLOAD)) { + FATAL_MESSAGE0( + 1, "'#pragma omp requires reverse_offload' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != + (flags & OMP_REQ_UNIFIED_ADDRESS)) { + FATAL_MESSAGE0( + 1, "'#pragma omp requires unified_address' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != + (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + FATAL_MESSAGE0( + 1, + "'#pragma omp requires unified_shared_memory' not used consistently!"); + } + + // TODO: insert any other missing checks + + DP("New requires flags %" PRId64 " compatible with existing %" PRId64 "!\n", + flags, RequiresFlags); +} + +void OffloadingPlugin::RegisterLib(__tgt_bin_desc *desc) { + // Attempt to load all plugins available in the system. 
+ std::call_once(initFlag, &OffloadingPlugin::LoadRTLs, this); + + RTLsMtx->lock(); + + // Register the images with the RTLs that understand them, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. + for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + + // If this RTL is not already in use, initialize it. + if (!R.isUsed) { + // Initialize the device information for the RTL we are about to use. + DeviceTy device(&R); + size_t start = Devices.size(); + Devices.resize(start + R.NumberOfDevices, device); + for (int32_t device_id = 0; device_id < R.NumberOfDevices; + device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + } + + // Initialize the index of this RTL and save it in the used RTLs. + R.Idx = (UsedRTLs.empty()) + ? 0 + : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; + assert((size_t)R.Idx == start && + "RTL index should equal the number of devices used so far."); + R.isUsed = true; + UsedRTLs.push_back(&R); + + DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); + } + + // Initialize (if necessary) translation table for this library. + TrlTblMtx->lock(); + if (!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)) { + TranslationTable &tt = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + tt.HostTable.EntriesBegin = desc->HostEntriesBegin; + tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. 
+ TranslationTable &TransTable = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + + DP("Registering image " DPxMOD " with RTL %s!\n", DPxPTR(img->ImageStart), + R.RTLName.c_str()); + RegisterImageIntoTranslationTable(TransTable, R, img); + TrlTblMtx->unlock(); + FoundRTL = &R; + + // Load ctors/dtors for static objects + RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); + + // if an RTL was found we are done - proceed to register the next image + break; + } + + if (!FoundRTL) { + DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); + } + } + RTLsMtx->unlock(); + + DP("Done registering entries!\n"); +} + +void OffloadingPlugin::UnregisterLib(__tgt_bin_desc *desc) { + DP("Unloading target library!\n"); + + RTLsMtx->unlock(); + + // Find which RTL understands each image, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. We only need to scan RTLs that are already being used. + for (auto *R : UsedRTLs) { + + assert(R->isUsed && "Expecting used RTLs."); + + if (!R->is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + FoundRTL = R; + + // Execute dtors for static objects if the device has been used, i.e. + // if its PendingCtors list has been emptied. 
+ for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[FoundRTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { + int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, + NULL, 1, 1, true /*team*/, this); + if (rc != OFFLOAD_SUCCESS) { + DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); + } + } + // Remove this library's entry from PendingCtorsDtors + Device.PendingCtorsDtors.erase(desc); + } + Device.PendingGlobalsMtx.unlock(); + } + + DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + break; + } + + // if no RTL was found proceed to unregister the next image + if (!FoundRTL) { + DP("No RTLs in use support the image " DPxMOD "!\n", + DPxPTR(img->ImageStart)); + } + } + + RTLsMtx->unlock(); + DP("Done unregistering images!\n"); + + // Remove entries from HostPtrToTableMap + TblMapMtx->lock(); + for (__tgt_offload_entry *cur = desc->HostEntriesBegin; + cur < desc->HostEntriesEnd; ++cur) { + HostPtrToTableMap->erase(cur->addr); + } + + // Remove translation table for this descriptor. + auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); + if (tt != HostEntriesBeginToTransTable->end()) { + DP("Removing translation table for descriptor " DPxMOD "\n", + DPxPTR(desc->HostEntriesBegin)); + HostEntriesBeginToTransTable->erase(tt); + } else { + DP("Translation table for descriptor " DPxMOD " cannot be found, probably " + "it has been already removed.\n", + DPxPTR(desc->HostEntriesBegin)); + } + + TblMapMtx->unlock(); + + // TODO: Remove RTL and the devices it manages if it's not used anymore? + // TODO: Write some RTL->unload_image(...) function? 
+ + DP("Done unregistering library!\n"); +} + +// Check whether a device has been initialized, global ctors have been +// executed and global data has been mapped; do so if not already done. +int OffloadingPlugin::CheckDeviceAndCtors(int64_t device_id) { + // Is device ready? + if (!device_is_ready(device_id)) { + REPORT("Device %" PRId64 " is not ready.\n", device_id); + return OFFLOAD_FAIL; + } + + // Get device info. + DeviceTy &Device = Devices[device_id]; + + // Check whether global data has been mapped for this device + Device.PendingGlobalsMtx.lock(); + bool hasPendingGlobals = Device.HasPendingGlobals; + Device.PendingGlobalsMtx.unlock(); + if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { + REPORT("Failed to init globals on device %" PRId64 "\n", device_id); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +/// Map global data and execute pending ctors +int OffloadingPlugin::InitLibrary(DeviceTy &Device) { + /* + * Map global data + */ + int32_t device_id = Device.DeviceID; + int rc = OFFLOAD_SUCCESS; + + Device.PendingGlobalsMtx.lock(); + TrlTblMtx->lock(); + for (HostEntriesBeginToTransTableTy::iterator ii = + HostEntriesBeginToTransTable->begin(); + ii != HostEntriesBeginToTransTable->end(); ++ii) { + TranslationTable *TransTable = &ii->second; + if (TransTable->HostTable.EntriesBegin == + TransTable->HostTable.EntriesEnd) { + // No host entry so no need to proceed + continue; + } + if (TransTable->TargetsTable[device_id] != 0) { + // Library entries have already been processed + continue; + } + + // 1) get image. + assert(TransTable->TargetsImages.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_device_image *img = TransTable->TargetsImages[device_id]; + if (!img) { + REPORT("No image loaded for device id %d.\n", device_id); + rc = OFFLOAD_FAIL; + break; + } + // 2) load image into the target table. 
+ __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] = + Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail. + if (!TargetTable) { + REPORT("Unable to generate entries table for device id %d.\n", device_id); + TransTable->TargetsImages[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // Verify whether the two table sizes match. + size_t hsize = + TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for these host entries! + if (hsize != tsize) { + REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); + TransTable->TargetsImages[device_id] = 0; + TransTable->TargetsTable[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // process global data that needs to be mapped. + Device.DataMapMtx.lock(); + __tgt_target_table *HostTable = &TransTable->HostTable; + for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data. + assert(CurrDeviceEntry->size == CurrHostEntry->size && + "data size mismatch"); + + // Fortran may use multiple weak declarations for the same symbol, + // therefore we must allow for multiple weak symbols to be loaded from + // the fat binary. Treat these mappings as any other "regular" + // mapping. Add entry to map. 
+ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) + continue; + DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" + "\n", + DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); + Device.HostDataToTargetMap.emplace( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, + (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, + true /*IsRefCountINF*/); + } + } + Device.DataMapMtx.unlock(); + } + TrlTblMtx->unlock(); + + if (rc != OFFLOAD_SUCCESS) { + Device.PendingGlobalsMtx.unlock(); + return rc; + } + + /* + * Run ctors for static objects + */ + if (!Device.PendingCtorsDtors.empty()) { + // Call all ctors for all libraries registered so far + for (auto &lib : Device.PendingCtorsDtors) { + if (!lib.second.PendingCtors.empty()) { + DP("Has pending ctors... call now\n"); + for (auto &entry : lib.second.PendingCtors) { + void *ctor = entry; + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, NULL, NULL, 1, + 1, true /*team*/, this); + if (rc != OFFLOAD_SUCCESS) { + REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } + } + // Clear the list to indicate that this device has been used + lib.second.PendingCtors.clear(); + DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); + } + } + } + Device.HasPendingGlobals = false; + Device.PendingGlobalsMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering libs + +void OffloadingPlugin::RegisterImageIntoTranslationTable( + TranslationTable &TT, RTLInfoTy &RTL, __tgt_device_image *image) { + + // same size, as when we increase one, we also increase the other. 
+ assert(TT.TargetsTable.size() == TT.TargetsImages.size() && + "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if + // required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type. + for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { + // If we are changing the image we are also invalidating the target table. + if (TT.TargetsImages[RTL.Idx + i] != image) { + TT.TargetsImages[RTL.Idx + i] = image; + TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering Ctors/Dtors + +void OffloadingPlugin::RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, + RTLInfoTy *RTL) { + + for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[RTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + Device.HasPendingGlobals = true; + for (__tgt_offload_entry *entry = img->EntriesBegin; + entry != img->EntriesEnd; ++entry) { + if (entry->flags & OMP_DECLARE_TARGET_CTOR) { + DP("Adding ctor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); + } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { + // Dtors are pushed in reverse order so they are executed from end + // to beginning when unregistering the library! 
+ DP("Adding dtor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); + } + + if (entry->flags & OMP_DECLARE_TARGET_LINK) { + DP("The \"link\" attribute is not yet supported!\n"); + } + } + Device.PendingGlobalsMtx.unlock(); + } +} + +int OffloadingPlugin::omp_get_num_devices(void) { + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + + DP("Call to omp_get_num_devices returning %zd\n", Devices_size); + + return Devices_size; +} + +int OffloadingPlugin::omp_get_initial_device(void) { + DP("Call to omp_get_initial_device returning %d\n", host_device); + return host_device; +} + +void *OffloadingPlugin::omp_target_alloc(size_t size, int device_num) { + DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", + device_num, size); + + if (size <= 0) { + DP("Call to omp_target_alloc with non-positive length\n"); + return NULL; + } + + void *rc = NULL; + + if (device_num == omp_get_initial_device()) { + rc = malloc(size); + DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_alloc returns NULL ptr\n"); + return NULL; + } + + rc = Devices[device_num].allocData(size); + DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; +} + +void OffloadingPlugin::omp_target_free(void *device_ptr, int device_num) { + DP("Call to omp_target_free for device %d and address " DPxMOD "\n", + device_num, DPxPTR(device_ptr)); + + if (!device_ptr) { + DP("Call to omp_target_free with NULL ptr\n"); + return; + } + + if (device_num == omp_get_initial_device()) { + free(device_ptr); + DP("omp_target_free deallocated host ptr\n"); + return; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_free returns, nothing to do\n"); + return; + } + + Devices[device_num].deleteData(device_ptr); + DP("omp_target_free deallocated device ptr\n"); +} + +int 
OffloadingPlugin::omp_target_is_present(void *ptr, int device_num) { + DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", + device_num, DPxPTR(ptr)); + + if (!ptr) { + DP("Call to omp_target_is_present with NULL ptr, returning false\n"); + return false; + } + + if (device_num == omp_get_initial_device()) { + DP("Call to omp_target_is_present on host, returning true\n"); + return true; + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Call to omp_target_is_present with invalid device ID, returning " + "false\n"); + return false; + } + + DeviceTy &Device = Devices[device_num]; + bool IsLast; // not used + bool IsHostPtr; + void *TgtPtr = + Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr, RequiresFlags); + int rc = (TgtPtr != NULL); + // Under unified memory the host pointer can be returned by the + // getTgtPtrBegin() function which means that there is no device + // corresponding point for ptr. This function should return false + // in that situation. 
+ if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + rc = !IsHostPtr; + DP("Call to omp_target_is_present returns %d\n", rc); + return rc; +} + +int OffloadingPlugin::omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device) { + DP("Call to omp_target_memcpy, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " + "src offset %zu, length %zu\n", + dst_device, src_device, DPxPTR(dst), DPxPTR(src), dst_offset, src_offset, + length); + + if (!dst || !src || length <= 0) { + REPORT("Call to omp_target_memcpy with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { + REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { + REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + int rc = OFFLOAD_SUCCESS; + void *srcAddr = (char *)src + src_offset; + void *dstAddr = (char *)dst + dst_offset; + + if (src_device == omp_get_initial_device() && + dst_device == omp_get_initial_device()) { + DP("copy from host to host\n"); + const void *p = memcpy(dstAddr, srcAddr, length); + if (p == NULL) + rc = OFFLOAD_FAIL; + } else if (src_device == omp_get_initial_device()) { + DP("copy from host to device\n"); + DeviceTy &DstDev = Devices[dst_device]; + rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr); + } else if (dst_device == omp_get_initial_device()) { + DP("copy from device to host\n"); + DeviceTy &SrcDev = Devices[src_device]; + rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr); + } else { + DP("copy from device to device\n"); + DeviceTy &SrcDev = Devices[src_device]; + DeviceTy &DstDev = Devices[dst_device]; + // First try to use D2D memcpy which is more efficient. If fails, fall back + // to unefficient way. 
+ if (SrcDev.isDataExchangable(DstDev)) { + rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + return OFFLOAD_SUCCESS; + } + + void *buffer = malloc(length); + rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + rc = DstDev.submitData(dstAddr, buffer, length, nullptr); + free(buffer); + } + + DP("omp_target_memcpy returns %d\n", rc); + return rc; +} + +int OffloadingPlugin::omp_target_memcpy_rect( + void *dst, void *src, size_t element_size, int num_dims, + const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, + const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device, + int src_device) { + DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " + "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " + "volume " DPxMOD ", element size %zu, num_dims %d\n", + dst_device, src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), + DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), + DPxPTR(volume), element_size, num_dims); + + if (!(dst || src)) { + DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", + INT_MAX); + return INT_MAX; + } + + if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || + !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { + REPORT("Call to omp_target_memcpy_rect with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + int rc; + if (num_dims == 1) { + rc = omp_target_memcpy( + dst, src, element_size * volume[0], element_size * dst_offsets[0], + element_size * src_offsets[0], dst_device, src_device); + } else { + size_t dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i = 1; i < num_dims; ++i) { + dst_slice_size *= dst_dimensions[i]; + src_slice_size *= src_dimensions[i]; + } + + size_t dst_off = dst_offsets[0] * 
dst_slice_size; + size_t src_off = src_offsets[0] * src_slice_size; + for (size_t i = 0; i < volume[0]; ++i) { + rc = omp_target_memcpy_rect( + (char *)dst + dst_off + dst_slice_size * i, + (char *)src + src_off + src_slice_size * i, element_size, + num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1, + dst_dimensions + 1, src_dimensions + 1, dst_device, src_device); + + if (rc) { + DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n"); + return rc; + } + } + } + + DP("omp_target_memcpy_rect returns %d\n", rc); + return rc; +} + +int OffloadingPlugin::omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, + size_t device_offset, + int device_num) { + DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", " + "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n", + DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num); + + if (!host_ptr || !device_ptr || size <= 0) { + REPORT("Call to omp_target_associate_ptr with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + if (device_num == omp_get_initial_device()) { + REPORT("omp_target_associate_ptr: no association possible on the host\n"); + return OFFLOAD_FAIL; + } + + if (!device_is_ready(device_num)) { + REPORT("omp_target_associate_ptr returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + DeviceTy &Device = Devices[device_num]; + void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset); + int rc = Device.associatePtr(host_ptr, device_addr, size); + DP("omp_target_associate_ptr returns %d\n", rc); + return rc; +} + +int OffloadingPlugin::omp_target_disassociate_ptr(void *host_ptr, + int device_num) { + DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", " + "device_num %d\n", + DPxPTR(host_ptr), device_num); + + if (!host_ptr) { + REPORT("Call to omp_target_associate_ptr with invalid host_ptr\n"); + return OFFLOAD_FAIL; + } + + if (device_num == omp_get_initial_device()) { + REPORT( + 
"omp_target_disassociate_ptr: no association possible on the host\n"); + return OFFLOAD_FAIL; + } + + if (!device_is_ready(device_num)) { + REPORT("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + DeviceTy &Device = Devices[device_num]; + int rc = Device.disassociatePtr(host_ptr); + DP("omp_target_disassociate_ptr returns %d\n", rc); + return rc; +} diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -11,9 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "device.h" -#include "private.h" -#include "rtl.h" +#include "offloading.h" #include #include @@ -50,158 +48,6 @@ */ static const int64_t Alignment = 8; -/// Map global data and execute pending ctors -static int InitLibrary(DeviceTy& Device) { - /* - * Map global data - */ - int32_t device_id = Device.DeviceID; - int rc = OFFLOAD_SUCCESS; - - Device.PendingGlobalsMtx.lock(); - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(); - ii != HostEntriesBeginToTransTable->end(); ++ii) { - TranslationTable *TransTable = &ii->second; - if (TransTable->HostTable.EntriesBegin == - TransTable->HostTable.EntriesEnd) { - // No host entry so no need to proceed - continue; - } - if (TransTable->TargetsTable[device_id] != 0) { - // Library entries have already been processed - continue; - } - - // 1) get image. - assert(TransTable->TargetsImages.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_device_image *img = TransTable->TargetsImages[device_id]; - if (!img) { - REPORT("No image loaded for device id %d.\n", device_id); - rc = OFFLOAD_FAIL; - break; - } - // 2) load image into the target table. 
- __tgt_target_table *TargetTable = - TransTable->TargetsTable[device_id] = Device.load_binary(img); - // Unable to get table for this image: invalidate image and fail. - if (!TargetTable) { - REPORT("Unable to generate entries table for device id %d.\n", device_id); - TransTable->TargetsImages[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // Verify whether the two table sizes match. - size_t hsize = - TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; - size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; - - // Invalid image for these host entries! - if (hsize != tsize) { - REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n", - device_id, hsize, tsize); - TransTable->TargetsImages[device_id] = 0; - TransTable->TargetsTable[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // process global data that needs to be mapped. - Device.DataMapMtx.lock(); - __tgt_target_table *HostTable = &TransTable->HostTable; - for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, - *CurrHostEntry = HostTable->EntriesBegin, - *EntryDeviceEnd = TargetTable->EntriesEnd; - CurrDeviceEntry != EntryDeviceEnd; - CurrDeviceEntry++, CurrHostEntry++) { - if (CurrDeviceEntry->size != 0) { - // has data. - assert(CurrDeviceEntry->size == CurrHostEntry->size && - "data size mismatch"); - - // Fortran may use multiple weak declarations for the same symbol, - // therefore we must allow for multiple weak symbols to be loaded from - // the fat binary. Treat these mappings as any other "regular" mapping. - // Add entry to map. 
- if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) - continue; - DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" - "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), - CurrDeviceEntry->size); - Device.HostDataToTargetMap.emplace( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, - true /*IsRefCountINF*/); - } - } - Device.DataMapMtx.unlock(); - } - TrlTblMtx->unlock(); - - if (rc != OFFLOAD_SUCCESS) { - Device.PendingGlobalsMtx.unlock(); - return rc; - } - - /* - * Run ctors for static objects - */ - if (!Device.PendingCtorsDtors.empty()) { - // Call all ctors for all libraries registered so far - for (auto &lib : Device.PendingCtorsDtors) { - if (!lib.second.PendingCtors.empty()) { - DP("Has pending ctors... call now\n"); - for (auto &entry : lib.second.PendingCtors) { - void *ctor = entry; - int rc = target(device_id, ctor, 0, NULL, NULL, NULL, NULL, NULL, 1, - 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); - Device.PendingGlobalsMtx.unlock(); - return OFFLOAD_FAIL; - } - } - // Clear the list to indicate that this device has been used - lib.second.PendingCtors.clear(); - DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); - } - } - } - Device.HasPendingGlobals = false; - Device.PendingGlobalsMtx.unlock(); - - return OFFLOAD_SUCCESS; -} - -// Check whether a device has been initialized, global ctors have been -// executed and global data has been mapped; do so if not already done. -int CheckDeviceAndCtors(int64_t device_id) { - // Is device ready? - if (!device_is_ready(device_id)) { - REPORT("Device %" PRId64 " is not ready.\n", device_id); - return OFFLOAD_FAIL; - } - - // Get device info. 
- DeviceTy &Device = Devices[device_id]; - - // Check whether global data has been mapped for this device - Device.PendingGlobalsMtx.lock(); - bool hasPendingGlobals = Device.HasPendingGlobals; - Device.PendingGlobalsMtx.unlock(); - if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { - REPORT("Failed to init globals on device %" PRId64 "\n", device_id); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - static int32_t getParentIndex(int64_t type) { return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } @@ -210,14 +56,15 @@ // target_data_* function (target_data_{begin,end,update}). int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg, int64_t arg_size, int64_t arg_type, void *arg_mapper, - TargetDataFuncPtrTy target_data_function) { + TargetDataFuncPtrTy target_data_function, + OffloadingPlugin *OP) { DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); // The mapper function fills up Components. MapperComponentsTy MapperComponents; MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper); (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, - arg_type); + arg_type); // Construct new arrays for args_base, args, arg_sizes and arg_types // using the information in MapperComponents and call the corresponding @@ -240,7 +87,7 @@ int rc = target_data_function(Device, MapperComponents.Components.size(), MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), - /*arg_mappers*/ nullptr, + /*arg_mappers*/ nullptr, OP, /*__tgt_async_info*/ nullptr); return rc; @@ -249,7 +96,8 @@ /// Internal function to do the mapping and transfer the data to the device int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, __tgt_async_info *async_info_ptr) { + void **arg_mappers, OffloadingPlugin *OP, + __tgt_async_info *async_info_ptr) { // process each input. 
for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -263,8 +111,9 @@ // with new arguments. DP("Calling targetDataMapper for the %dth argument\n", i); - int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_mappers[i], targetDataBegin); + int rc = + targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], + arg_types[i], arg_mappers[i], targetDataBegin, OP); if (rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -281,17 +130,18 @@ int64_t data_size = arg_sizes[i]; // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. + // Look at the next argument - if that is MEMBER_OF this one, then this + // one is a combined entry. int64_t padding = 0; - const int next_i = i+1; + const int next_i = i + 1; if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && getParentIndex(arg_types[next_i]) == i) { padding = (int64_t)HstPtrBegin % Alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; + "\n", + padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - padding; data_size += padding; } } @@ -306,10 +156,11 @@ bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT; // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we - // have reached this point via __tgt_target_data_begin and not __tgt_target - // then no argument is marked as TARGET_PARAM ("omp target data map" is not - // associated with a target region, so there are no target parameters). This - // may be considered a hack, we could revise the scheme in the future. 
+ // have reached this point via __tgt_target_data_begin and not + // __tgt_target then no argument is marked as TARGET_PARAM ("omp target + // data map" is not associated with a target region, so there are no + // target parameters). This may be considered a hack, we could revise the + // scheme in the future. bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); @@ -323,12 +174,13 @@ // afterward, so the pointer is already allocated by the time the // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus // non-null. However, "declare target link" can produce a PTR_AND_OBJ - // entry for a global that might not already be allocated by the time the - // PTR_AND_OBJ entry is handled below, and so the allocation might fail - // when HasPresentModifier. + // entry for a global that might not already be allocated by the time + // the PTR_AND_OBJ entry is handled below, and so the allocation might + // fail when HasPresentModifier. PointerTgtPtrBegin = Device.getOrAllocTgtPtr( HstPtrBase, HstPtrBase, sizeof(void *), Pointer_IsNew, IsHostPtr, - IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier); + IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, + OP->RequiresFlags); if (!PointerTgtPtrBegin) { REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" @@ -336,8 +188,9 @@ return OFFLOAD_FAIL; } DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), - (Pointer_IsNew ? "" : " not")); + "\n", + sizeof(void *), DPxPTR(PointerTgtPtrBegin), + (Pointer_IsNew ? "" : " not")); Pointer_HstPtrBegin = HstPtrBase; // modify current entry. 
HstPtrBase = *(void **)HstPtrBase; @@ -346,7 +199,7 @@ void *TgtPtrBegin = Device.getOrAllocTgtPtr( HstPtrBegin, HstPtrBase, data_size, IsNew, IsHostPtr, IsImplicit, - UpdateRef, HasCloseModifier, HasPresentModifier); + UpdateRef, HasCloseModifier, HasPresentModifier, OP->RequiresFlags); // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. if (!TgtPtrBegin && (data_size || HasPresentModifier)) { @@ -356,8 +209,8 @@ return OFFLOAD_FAIL; } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); + " - is%s new\n", + data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; @@ -368,7 +221,7 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { bool copy = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + if (!(OP->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || HasCloseModifier) { if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { copy = true; @@ -437,10 +290,12 @@ }; } // namespace -/// Internal function to undo the mapping and retrieve the data from the device. +/// Internal function to undo the mapping and retrieve the data from the +/// device. int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - void **ArgMappers, __tgt_async_info *AsyncInfo) { + void **ArgMappers, OffloadingPlugin *OP, + __tgt_async_info *AsyncInfo) { int Ret; std::vector DeallocTgtPtrs; // process each input. 
@@ -458,7 +313,7 @@ DP("Calling targetDataMapper for the %dth argument\n", I); Ret = targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgMappers[I], targetDataEnd); + ArgTypes[I], ArgMappers[I], targetDataEnd, OP); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" @@ -473,8 +328,8 @@ void *HstPtrBegin = Args[I]; int64_t DataSize = ArgSizes[I]; // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. + // Look at the next argument - if that is MEMBER_OF this one, then this + // one is a combined entry. const int NextI = I + 1; if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && getParentIndex(ArgTypes[NextI]) == I) { @@ -497,20 +352,21 @@ bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin( - HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit); + void *TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrBegin, DataSize, IsLast, UpdateRef, + IsHostPtr, OP->RequiresFlags, !IsImplicit); if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { DP("Mapping does not exist (%s)\n", (HasPresentModifier ? "'present' map type modifier" : "ignored")); if (HasPresentModifier) { - // This should be an error upon entering an "omp target exit data". It + // This should be an error upon entering an "omp target exit data". It // should not be an error upon exiting an "omp target data" or "omp // target". For "omp target data", Clang thus doesn't include present - // modifiers for end calls. 
For "omp target", we have not found a valid - // OpenMP program for which the error matters: it appears that, if a - // program can guarantee that data is present at the beginning of an - // "omp target" region so that there's no error there, that data is also - // guaranteed to be present at the end. + // modifiers for end calls. For "omp target", we have not found a + // valid OpenMP program for which the error matters: it appears that, + // if a program can guarantee that data is present at the beginning of + // an "omp target" region so that there's no error there, that data is + // also guaranteed to be present at the end. MESSAGE("device mapping required by 'present' map type modifier does " "not exist for host address " DPxMOD " (%" PRId64 " bytes)", DPxPTR(HstPtrBegin), DataSize); @@ -534,7 +390,7 @@ if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; bool CopyMember = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + if (!(OP->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || HasCloseModifier) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { @@ -548,7 +404,7 @@ } if ((DelEntry || Always || CopyMember) && - !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !(OP->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); @@ -563,8 +419,8 @@ // If we copied back to the host a struct/array containing pointers, we // need to restore the original host pointer values from their shadow - // copies. If the struct is going to be deallocated, remove any remaining - // shadow pointer entries for this struct. + // copies. If the struct is going to be deallocated, remove any + // remaining shadow pointer entries for this struct. 
uintptr_t LB = (uintptr_t)HstPtrBegin; uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize; Device.ShadowMtx.lock(); @@ -581,7 +437,8 @@ if ((uintptr_t)ShadowHstPtrAddr >= UB) break; - // If we copied the struct to the host, we need to restore the pointer. + // If we copied the struct to the host, we need to restore the + // pointer. if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { DP("Restoring original host pointer value " DPxMOD " for host " "pointer " DPxMOD "\n", @@ -607,8 +464,8 @@ // We need to synchronize before deallocating data. // If AsyncInfo is nullptr, the previous data transfer (if has) will be - // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is - // nullptr, there is no data transfer happened because once there is, + // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue + // is nullptr, there is no data transfer happened because once there is, // AsyncInfo->Queue will not be nullptr, so again, we don't need to // synchronize. if (AsyncInfo && AsyncInfo->Queue) { @@ -633,11 +490,12 @@ } /// Internal function to pass data to/from the target. -// async_info_ptr is currently unused, added here so target_data_update has the -// same signature as targetDataBegin and targetDataEnd. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, __tgt_async_info *async_info_ptr) { +// async_info_ptr is currently unused, added here so target_data_update has +// the same signature as targetDataBegin and targetDataEnd. +int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, OffloadingPlugin *OP, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || @@ -650,14 +508,14 @@ // with new arguments. 
DP("Calling targetDataMapper for the %dth argument\n", i); - int rc = - targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_mappers[i], target_data_update); + int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], + arg_types[i], arg_mappers[i], + target_data_update, OP); if (rc != OFFLOAD_SUCCESS) { - REPORT( - "Call to target_data_update via targetDataMapper for custom mapper" - " failed.\n"); + REPORT("Call to target_data_update via targetDataMapper for custom " + "mapper" + " failed.\n"); return OFFLOAD_FAIL; } @@ -668,10 +526,12 @@ void *HstPtrBegin = args[i]; int64_t MapSize = arg_sizes[i]; bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin( - HstPtrBegin, MapSize, IsLast, false, IsHostPtr, /*MustContain=*/true); + void *TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, false, IsHostPtr, + OP->RequiresFlags, /*MustContain=*/true); if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + DP("hst data:" DPxMOD " not found, becomes a noop\n", + DPxPTR(HstPtrBegin)); if (arg_types[i] & OMP_TGT_MAPTYPE_PRESENT) { MESSAGE("device mapping required by 'present' motion modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", @@ -681,7 +541,7 @@ continue; } - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + if (OP->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin) { DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", DPxPTR(HstPtrBegin)); @@ -690,26 +550,26 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); int rt = Device.retrieveData(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = 
(uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void **)it->first; + if ((uintptr_t)ShadowHstPtrAddr < lb) continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= ub) break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(it->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); *ShadowHstPtrAddr = it->second.HstPtrVal; } Device.ShadowMtx.unlock(); @@ -717,18 +577,18 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { + it != Device.ShadowPtrMap.end(); ++it) { void **ShadowHstPtrAddr = (void **)it->first; if ((uintptr_t)ShadowHstPtrAddr < lb) continue; @@ -757,23 +617,23 @@ static bool isLambdaMapping(int64_t Mapping) { return (Mapping & LambdaMapping) == LambdaMapping; } - namespace { 
/// Find the table information in the map or look it up in the translation /// tables. -TableMap *getTableMap(void *HostPtr) { - std::lock_guard TblMapLock(*TblMapMtx); - HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(HostPtr); +TableMap *getTableMap(void *HostPtr, OffloadingPlugin *OP) { + std::lock_guard TblMapLock(*OP->TblMapMtx); + HostPtrToTableMapTy::iterator TableMapIt = + OP->HostPtrToTableMap->find(HostPtr); - if (TableMapIt != HostPtrToTableMap->end()) + if (TableMapIt != OP->HostPtrToTableMap->end()) return &TableMapIt->second; // We don't have a map. So search all the registered libraries. TableMap *TM = nullptr; - std::lock_guard TrlTblLock(*TrlTblMtx); + std::lock_guard TrlTblLock(*OP->TrlTblMtx); for (HostEntriesBeginToTransTableTy::iterator Itr = - HostEntriesBeginToTransTable->begin(); - Itr != HostEntriesBeginToTransTable->end(); ++Itr) { + OP->HostEntriesBeginToTransTable->begin(); + Itr != OP->HostEntriesBeginToTransTable->end(); ++Itr) { // get the translation table (which contains all the good info). TranslationTable *TransTable = &Itr->second; // iterate over all the host table entries to see if we can locate the @@ -784,7 +644,7 @@ continue; // we got a match, now fill the HostPtrToTableMap so that we // may avoid this search next time. - TM = &(*HostPtrToTableMap)[HostPtr]; + TM = &(*OP->HostPtrToTableMap)[HostPtr]; TM->Table = TransTable; TM->Index = I; return TM; @@ -798,12 +658,12 @@ /// FIXME: This function will not work right if calling /// __kmpc_push_target_tripcount in one thread but doing offloading in another /// thread, which might occur when we call task yield. 
-uint64_t getLoopTripCount(int64_t DeviceId) { - DeviceTy &Device = Devices[DeviceId]; +uint64_t getLoopTripCount(int64_t DeviceId, OffloadingPlugin *OP) { + DeviceTy &Device = OP->Devices[DeviceId]; uint64_t LoopTripCount = 0; { - std::lock_guard TblMapLock(*TblMapMtx); + std::lock_guard TblMapLock(*OP->TblMapMtx); auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); if (I != Device.LoopTripCnt.end()) { LoopTripCount = I->second; @@ -851,8 +711,8 @@ __tgt_async_info *AsyncInfo; // TODO: What would be the best value here? Should we make it configurable? - // If the size is larger than this threshold, we will allocate and transfer it - // immediately instead of packing it. + // If the size is larger than this threshold, we will allocate and transfer + // it immediately instead of packing it. static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; public: @@ -913,8 +773,8 @@ return OFFLOAD_SUCCESS; } - /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, - /// and start the transfer. + /// Pack first-private arguments, replace place holder pointers in \p + /// TgtArgs, and start the transfer. int packAndTransfer(std::vector &TgtArgs) { if (!FirstPrivateArgInfo.empty()) { assert(FirstPrivateArgSize != 0 && @@ -976,19 +836,19 @@ } }; -/// Process data before launching the kernel, including calling targetDataBegin -/// to map and transfer data to target device, transferring (first-)private -/// variables. +/// Process data before launching the kernel, including calling +/// targetDataBegin to map and transfer data to target device, transferring +/// (first-)private variables. 
int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers, std::vector &TgtArgs, std::vector &TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { - DeviceTy &Device = Devices[DeviceId]; + __tgt_async_info *AsyncInfo, OffloadingPlugin *OP) { + DeviceTy &Device = OP->Devices[DeviceId]; int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, - ArgMappers, AsyncInfo); + ArgMappers, OP, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; @@ -1018,14 +878,15 @@ DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *PointerTgtPtrBegin = Device.getTgtPtrBegin( - HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr); + void *PointerTgtPtrBegin = + Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I], IsLast, false, + IsHostPtr, OP->RequiresFlags); if (!PointerTgtPtrBegin) { DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", DPxPTR(HstPtrVal)); continue; } - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + if (OP->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin) { DP("Unified memory is active, no need to map lambda captured" "variable (" DPxMOD ")\n", @@ -1071,7 +932,7 @@ if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) HstPtrBase = *reinterpret_cast(HstPtrBase); TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, - false, IsHostPtr); + false, IsHostPtr, OP->RequiresFlags); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); @@ -1097,18 +958,19 @@ return OFFLOAD_SUCCESS; } -/// Process data after launching the kernel, including transferring data back to 
-/// host if needed and deallocating target memory of (first-)private variables. +/// Process data after launching the kernel, including transferring data back +/// to host if needed and deallocating target memory of (first-)private +/// variables. int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { - DeviceTy &Device = Devices[DeviceId]; + __tgt_async_info *AsyncInfo, OffloadingPlugin *OP) { + DeviceTy &Device = OP->Devices[DeviceId]; // Move data from device. int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, - ArgMappers, AsyncInfo); + ArgMappers, OP, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd failed, abort target.\n"); return OFFLOAD_FAIL; @@ -1133,10 +995,11 @@ /// integer different from zero otherwise. int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers, - int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct) { - DeviceTy &Device = Devices[DeviceId]; + int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct, + OffloadingPlugin *OP) { + DeviceTy &Device = OP->Devices[DeviceId]; - TableMap *TM = getTableMap(HostPtr); + TableMap *TM = getTableMap(HostPtr, OP); // No map for this host pointer found! if (!TM) { REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", @@ -1147,7 +1010,7 @@ // get target table. 
__tgt_target_table *TargetTable = nullptr; { - std::lock_guard TrlTblLock(*TrlTblMtx); + std::lock_guard TrlTblLock(*OP->TrlTblMtx); assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!"); TargetTable = TM->Table->TargetsTable[DeviceId]; @@ -1162,16 +1025,16 @@ PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo); // Process data, such as data mapping, before launching the kernel - int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args, - ArgSizes, ArgTypes, ArgMappers, TgtArgs, - TgtOffsets, PrivateArgumentManager, &AsyncInfo); + int Ret = processDataBefore( + DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgMappers, + TgtArgs, TgtOffsets, PrivateArgumentManager, &AsyncInfo, OP); if (Ret != OFFLOAD_SUCCESS) { REPORT("Failed to process data before launching the kernel.\n"); return OFFLOAD_FAIL; } // Get loop trip count - uint64_t LoopTripCount = getLoopTripCount(DeviceId); + uint64_t LoopTripCount = getLoopTripCount(DeviceId, OP); // Launch device execution. 
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; @@ -1195,7 +1058,7 @@ // variables Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgMappers, PrivateArgumentManager, - &AsyncInfo); + &AsyncInfo, OP); if (Ret != OFFLOAD_SUCCESS) { REPORT("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -18,28 +18,6 @@ #include -extern int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, - __tgt_async_info *async_info_ptr); - -extern int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, - void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - void **ArgMappers, __tgt_async_info *AsyncInfo); - -extern int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, - __tgt_async_info *async_info_ptr = nullptr); - -extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, - void **ArgBases, void **Args, int64_t *ArgSizes, - int64_t *ArgTypes, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int IsTeamConstruct); - -extern int CheckDeviceAndCtors(int64_t device_id); - // enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition enum kmp_target_offload_kind { tgt_disabled = 0, @@ -74,11 +52,13 @@ // size_t size, int64_t type); typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t); +class OffloadingPlugin; + // Function pointer type for target_data_* functions (targetDataBegin, // targetDataEnd and target_data_update). 
typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, void **, - __tgt_async_info *); + OffloadingPlugin *OP, __tgt_async_info *); // Implemented in libomp, they are called from within __tgt_* functions. #ifdef __cplusplus diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -20,10 +20,6 @@ #include #include -// Forward declarations. -struct DeviceTy; -struct __tgt_bin_desc; - struct RTLInfoTy { typedef int32_t(is_valid_binary_ty)(void *); typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); @@ -129,39 +125,6 @@ } }; -/// RTLs identified in the system. -class RTLsTy { -private: - // Mutex-like object to guarantee thread-safety and unique initialization - // (i.e. the library attempts to load the RTLs (plugins) only once). - std::once_flag initFlag; - void LoadRTLs(); // not thread-safe - -public: - // List of the detected runtime libraries. - std::list AllRTLs; - - // Array of pointers to the detected runtime libraries that have compatible - // binaries. - std::vector UsedRTLs; - - int64_t RequiresFlags = OMP_REQ_UNDEFINED; - - explicit RTLsTy() = default; - - // Register the clauses of the requires directive. - void RegisterRequires(int64_t flags); - - // Register a shared library with all (compatible) RTLs. - void RegisterLib(__tgt_bin_desc *desc); - - // Unregister a shared library from all RTLs. - void UnregisterLib(__tgt_bin_desc *desc); -}; -extern RTLsTy *RTLs; -extern std::mutex *RTLsMtx; - - /// Map between the host entry begin and the translation table. Each /// registered library gets one TranslationTable. 
Use the map from /// __tgt_offload_entry so that we may quickly determine whether we @@ -177,8 +140,6 @@ }; typedef std::map<__tgt_offload_entry *, TranslationTable> HostEntriesBeginToTransTableTy; -extern HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -extern std::mutex *TrlTblMtx; /// Map between the host ptr and a table index struct TableMap { @@ -189,7 +150,5 @@ : Table(table), Index(index) {} }; typedef std::map HostPtrToTableMapTy; -extern HostPtrToTableMapTy *HostPtrToTableMap; -extern std::mutex *TblMapMtx; #endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -10,9 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "device.h" -#include "private.h" -#include "rtl.h" +#include "offloading.h" +#include "omptarget.h" #include #include @@ -21,417 +20,14 @@ #include #include -// List of all plugins that can support offloading. 
-static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so", - /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", - /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", -}; - -RTLsTy *RTLs; -std::mutex *RTLsMtx; - -HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -std::mutex *TrlTblMtx; - -HostPtrToTableMapTy *HostPtrToTableMap; -std::mutex *TblMapMtx; +OffloadingPlugin *OP; __attribute__((constructor(101))) void init() { DP("Init target library!\n"); - RTLs = new RTLsTy(); - RTLsMtx = new std::mutex(); - HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); - TrlTblMtx = new std::mutex(); - HostPtrToTableMap = new HostPtrToTableMapTy(); - TblMapMtx = new std::mutex(); + OP = new OffloadingPlugin(); } __attribute__((destructor(101))) void deinit() { DP("Deinit target library!\n"); - delete RTLs; - delete RTLsMtx; - delete HostEntriesBeginToTransTable; - delete TrlTblMtx; - delete HostPtrToTableMap; - delete TblMapMtx; -} - -void RTLsTy::LoadRTLs() { - // Parse environment variable OMP_TARGET_OFFLOAD (if set) - TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); - if (TargetOffloadPolicy == tgt_disabled) { - return; - } - - DP("Loading RTLs...\n"); - - // Attempt to open all the plugins and, if they exist, check if the interface - // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - void *dynlib_handle = dlopen(Name, RTLD_NOW); - - if (!dynlib_handle) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, dlerror()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - - // Retrieve the RTL information from the runtime library. 
- RTLInfoTy R; - - R.LibraryHandler = dynlib_handle; - R.isUsed = false; - -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif - - if (!(*((void **)&R.is_valid_binary) = - dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) - continue; - if (!(*((void **)&R.number_of_devices) = - dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) - continue; - if (!(*((void **)&R.init_device) = - dlsym(dynlib_handle, "__tgt_rtl_init_device"))) - continue; - if (!(*((void **)&R.load_binary) = - dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) - continue; - if (!(*((void **)&R.data_alloc) = - dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) - continue; - if (!(*((void **)&R.data_submit) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) - continue; - if (!(*((void **)&R.data_retrieve) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) - continue; - if (!(*((void **)&R.data_delete) = - dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) - continue; - if (!(*((void **)&R.run_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) - continue; - if (!(*((void **)&R.run_team_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) - continue; - - // Optional functions - *((void **)&R.init_requires) = - dlsym(dynlib_handle, "__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); - *((void **)&R.data_exchange) = - dlsym(dynlib_handle, "__tgt_rtl_data_exchange"); - *((void **)&R.data_exchange_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async"); - *((void **)&R.is_data_exchangable) = - dlsym(dynlib_handle, 
"__tgt_rtl_is_data_exchangable"); - - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - DP("No devices supported in this RTL\n"); - continue; - } - - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // The RTL is valid! Will save the information in the RTLs list. - AllRTLs.push_back(R); - } - - DP("RTLs loaded!\n"); - - return; -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering libs - -static void RegisterImageIntoTranslationTable(TranslationTable &TT, - RTLInfoTy &RTL, __tgt_device_image *image) { - - // same size, as when we increase one, we also increase the other. - assert(TT.TargetsTable.size() == TT.TargetsImages.size() && - "We should have as many images as we have tables!"); - - // Resize the Targets Table and Images to accommodate the new targets if - // required - unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; - - if (TT.TargetsTable.size() < TargetsTableMinimumSize) { - TT.TargetsImages.resize(TargetsTableMinimumSize, 0); - TT.TargetsTable.resize(TargetsTableMinimumSize, 0); - } - - // Register the image in all devices for this target type. - for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { - // If we are changing the image we are also invalidating the target table. - if (TT.TargetsImages[RTL.Idx + i] != image) { - TT.TargetsImages[RTL.Idx + i] = image; - TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. 
- } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering Ctors/Dtors - -static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, - __tgt_device_image *img, RTLInfoTy *RTL) { - - for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[RTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - Device.HasPendingGlobals = true; - for (__tgt_offload_entry *entry = img->EntriesBegin; - entry != img->EntriesEnd; ++entry) { - if (entry->flags & OMP_DECLARE_TARGET_CTOR) { - DP("Adding ctor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); - } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { - // Dtors are pushed in reverse order so they are executed from end - // to beginning when unregistering the library! - DP("Adding dtor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); - } - - if (entry->flags & OMP_DECLARE_TARGET_LINK) { - DP("The \"link\" attribute is not yet supported!\n"); - } - } - Device.PendingGlobalsMtx.unlock(); - } -} - -void RTLsTy::RegisterRequires(int64_t flags) { - // TODO: add more elaborate check. - // Minimal check: only set requires flags if previous value - // is undefined. This ensures that only the first call to this - // function will set the requires flags. All subsequent calls - // will be checked for compatibility. 
- assert(flags != OMP_REQ_UNDEFINED && - "illegal undefined flag for requires directive!"); - if (RequiresFlags == OMP_REQ_UNDEFINED) { - RequiresFlags = flags; - return; - } - - // If multiple compilation units are present enforce - // consistency across all of them for require clauses: - // - reverse_offload - // - unified_address - // - unified_shared_memory - if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != - (flags & OMP_REQ_REVERSE_OFFLOAD)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires reverse_offload' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != - (flags & OMP_REQ_UNIFIED_ADDRESS)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_address' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != - (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_shared_memory' not used consistently!"); - } - - // TODO: insert any other missing checks - - DP("New requires flags %" PRId64 " compatible with existing %" PRId64 "!\n", - flags, RequiresFlags); -} - -void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { - // Attempt to load all plugins available in the system. - std::call_once(initFlag, &RTLsTy::LoadRTLs, this); - - RTLsMtx->lock(); - // Register the images with the RTLs that understand them, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. - for (auto &R : AllRTLs) { - if (!R.is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - - // If this RTL is not already in use, initialize it. 
- if (!R.isUsed) { - // Initialize the device information for the RTL we are about to use. - DeviceTy device(&R); - size_t start = Devices.size(); - Devices.resize(start + R.NumberOfDevices, device); - for (int32_t device_id = 0; device_id < R.NumberOfDevices; - device_id++) { - // global device ID - Devices[start + device_id].DeviceID = start + device_id; - // RTL local device ID - Devices[start + device_id].RTLDeviceID = device_id; - } - - // Initialize the index of this RTL and save it in the used RTLs. - R.Idx = (UsedRTLs.empty()) - ? 0 - : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; - assert((size_t) R.Idx == start && - "RTL index should equal the number of devices used so far."); - R.isUsed = true; - UsedRTLs.push_back(&R); - - DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); - } - - // Initialize (if necessary) translation table for this library. - TrlTblMtx->lock(); - if(!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)){ - TranslationTable &tt = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - tt.HostTable.EntriesBegin = desc->HostEntriesBegin; - tt.HostTable.EntriesEnd = desc->HostEntriesEnd; - } - - // Retrieve translation table for this library. 
- TranslationTable &TransTable = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - - DP("Registering image " DPxMOD " with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - RegisterImageIntoTranslationTable(TransTable, R, img); - TrlTblMtx->unlock(); - FoundRTL = &R; - - // Load ctors/dtors for static objects - RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); - - // if an RTL was found we are done - proceed to register the next image - break; - } - - if (!FoundRTL) { - DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - - - DP("Done registering entries!\n"); -} - -void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { - DP("Unloading target library!\n"); - - RTLsMtx->lock(); - // Find which RTL understands each image, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. We only need to scan RTLs that are already being used. - for (auto *R : UsedRTLs) { - - assert(R->isUsed && "Expecting used RTLs."); - - if (!R->is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - FoundRTL = R; - - // Execute dtors for static objects if the device has been used, i.e. - // if its PendingCtors list has been emptied. 
- for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[FoundRTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { - for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, - NULL, 1, 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); - } - } - // Remove this library's entry from PendingCtorsDtors - Device.PendingCtorsDtors.erase(desc); - } - Device.PendingGlobalsMtx.unlock(); - } - - DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - break; - } - - // if no RTL was found proceed to unregister the next image - if (!FoundRTL){ - DP("No RTLs in use support the image " DPxMOD "!\n", - DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - DP("Done unregistering images!\n"); - - // Remove entries from HostPtrToTableMap - TblMapMtx->lock(); - for (__tgt_offload_entry *cur = desc->HostEntriesBegin; - cur < desc->HostEntriesEnd; ++cur) { - HostPtrToTableMap->erase(cur->addr); - } - - // Remove translation table for this descriptor. - auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); - if (tt != HostEntriesBeginToTransTable->end()) { - DP("Removing translation table for descriptor " DPxMOD "\n", - DPxPTR(desc->HostEntriesBegin)); - HostEntriesBeginToTransTable->erase(tt); - } else { - DP("Translation table for descriptor " DPxMOD " cannot be found, probably " - "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); - } - - TblMapMtx->unlock(); - - // TODO: Remove RTL and the devices it manages if it's not used anymore? - // TODO: Write some RTL->unload_image(...) function? - - DP("Done unregistering library!\n"); + delete OP; }