Index: openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp +++ openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp @@ -19,7 +19,7 @@ #include #include -#include "omptarget.h" +#include "omptargetplugin.h" #ifndef TARGET_NAME #define TARGET_NAME CUDA @@ -473,7 +473,7 @@ return DeviceInfo.getOffloadEntriesTable(device_id); } -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) { +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { if (size == 0) { return NULL; } @@ -559,8 +559,8 @@ } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit, - uint64_t loop_tripcount) { + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount) { // Set the context we are using. CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -571,9 +571,12 @@ // All args are references. std::vector args(arg_num); + std::vector ptrs(arg_num); - for (int32_t i = 0; i < arg_num; ++i) - args[i] = &tgt_args[i]; + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; @@ -678,12 +681,12 @@ } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { // use one team and the default number of threads. 
const int32_t team_num = 1; const int32_t thread_limit = 0; return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - arg_num, team_num, thread_limit, 0); + tgt_offsets, arg_num, team_num, thread_limit, 0); } #ifdef __cplusplus Index: openmp/trunk/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp =================================================================== --- openmp/trunk/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ openmp/trunk/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -22,7 +22,7 @@ #include #include -#include "omptarget.h" +#include "omptargetplugin.h" #ifndef TARGET_NAME #define TARGET_NAME Generic ELF - 64bit @@ -251,7 +251,7 @@ return DeviceInfo.getOffloadEntriesTable(device_id); } -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) { +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { void *ptr = malloc(size); return ptr; } @@ -274,8 +274,8 @@ } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit, - uint64_t loop_tripcount /*not used*/) { + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { // ignore team num and thread limit. // Use libffi to launch execution. @@ -284,9 +284,12 @@ // All args are references. 
std::vector args_types(arg_num, &ffi_type_pointer); std::vector args(arg_num); + std::vector ptrs(arg_num); - for (int32_t i = 0; i < arg_num; ++i) - args[i] = &tgt_args[i]; + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, &ffi_type_void, &args_types[0]); @@ -303,10 +306,10 @@ } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { // use one team and one thread. return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - arg_num, 1, 1, 0); + tgt_offsets, arg_num, 1, 1, 0); } #ifdef __cplusplus Index: openmp/trunk/libomptarget/src/omptarget.h =================================================================== --- openmp/trunk/libomptarget/src/omptarget.h +++ openmp/trunk/libomptarget/src/omptarget.h @@ -16,6 +16,7 @@ #define _OMPTARGET_H_ #include +#include #define OFFLOAD_SUCCESS (0) #define OFFLOAD_FAIL (~0) Index: openmp/trunk/libomptarget/src/omptarget.cpp =================================================================== --- openmp/trunk/libomptarget/src/omptarget.cpp +++ openmp/trunk/libomptarget/src/omptarget.cpp @@ -162,10 +162,11 @@ int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); - int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize); + int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize); int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount); private: // Call to RTL @@ -181,13 +182,14 @@ typedef 
int32_t(number_of_devices_ty)(); typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); - typedef void *(data_alloc_ty)(int32_t, int64_t); + typedef void *(data_alloc_ty)(int32_t, int64_t, void *); typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t); - typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t, - int32_t, int32_t, uint64_t); + typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t); + typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t, int32_t, int32_t, uint64_t); int32_t Idx; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -471,7 +473,7 @@ } DeviceTy &Device = Devices[device_num]; - rc = Device.RTL->data_alloc(Device.RTLDeviceID, size); + rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); return rc; } @@ -861,7 +863,7 @@ } else if (Size) { // If it is not contained and Size > 0 we should create a new entry for it. 
IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size); + uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); @@ -995,16 +997,17 @@ // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - int32_t TgtVarsSize) { - return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) { + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize); } // Run team region on device. int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount) { - return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize, - NumTeams, ThreadLimit, LoopTripCount); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount) { + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); } //////////////////////////////////////////////////////////////////////////////// @@ -2108,6 +2111,7 @@ } std::vector tgt_args; + std::vector tgt_offsets; // List of (first-)private arrays allocated for this target region std::vector fpArrays; @@ -2119,16 +2123,18 @@ } void *HstPtrBegin = args[i]; void *HstPtrBase = args_base[i]; - void *TgtPtrBase; + void *TgtPtrBegin; + ptrdiff_t TgtBaseOffset; bool IsLast; // unused. 
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { DP("Forwarding first-private value " DPxMOD " to the target construct\n", DPxPTR(HstPtrBase)); - TgtPtrBase = HstPtrBase; + TgtPtrBegin = HstPtrBase; + TgtBaseOffset = 0; } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { // Allocate memory for (first-)private array - void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i]); + TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, + arg_sizes[i], HstPtrBegin); if (!TgtPtrBegin) { DP ("Data allocation for %sprivate array " DPxMOD " failed\n", (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), @@ -2137,8 +2143,8 @@ break; } else { fpArrays.push_back(TgtPtrBegin); - uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", arg_sizes[i], DPxPTR(TgtPtrBegin), @@ -2155,24 +2161,29 @@ } } } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), - IsLast, false); - TgtPtrBase = TgtPtrBegin; // no offset for ptrs. + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, + false); + TgtBaseOffset = 0; // no offset for ptrs. 
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), DPxPTR(HstPtrBase)); } else { - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], - IsLast, false); - uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta); + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + false); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); } - tgt_args.push_back(TgtPtrBase); + tgt_args.push_back(TgtPtrBegin); + tgt_offsets.push_back(TgtBaseOffset); } // Push omp handle. tgt_args.push_back((void *)0); + tgt_offsets.push_back(0); + + assert(tgt_args.size() == tgt_offsets.size() && + "Size mismatch in arguments and offsets"); // Pop loop trip count uint64_t ltc = Device.loopTripCnt; @@ -2185,10 +2196,11 @@ DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); if (IsTeamConstruct) { rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, + thread_limit, ltc); } else { rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], tgt_args.size()); + &tgt_args[0], &tgt_offsets[0], tgt_args.size()); } } else { DP("Errors occurred while obtaining target arguments, skipping kernel " Index: openmp/trunk/libomptarget/src/omptargetplugin.h =================================================================== --- openmp/trunk/libomptarget/src/omptargetplugin.h +++ openmp/trunk/libomptarget/src/omptargetplugin.h @@ -0,0 +1,92 @@ +//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure 
+// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an interface between target independent OpenMP offload +// runtime library libomptarget and target dependent plugin. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGETPLUGIN_H_ +#define _OMPTARGETPLUGIN_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Return the number of available devices of the type supported by the +// target RTL. +int32_t __tgt_rtl_number_of_devices(void); + +// Return an integer different from zero if the provided device image can be +// supported by the runtime. The functionality is similar to comparing the +// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a +// lightweight query to determine if the RTL is suitable for an image without +// having to load the library, which can be expensive. +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); + +// Initialize the specified device. In case of success return 0; otherwise +// return an error code. +int32_t __tgt_rtl_init_device(int32_t ID); + +// Pass an executable image section described by image to the specified +// device and prepare an address table of target entities. In case of error, +// return NULL. Otherwise, return a pointer to the built address table. +// Individual entries in the table may also be NULL, when the corresponding +// offload region is not supported on the target device. +__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, + __tgt_device_image *Image); + +// Allocate data on the particular target device, of the specified size. 
+// HostPtr is an address of the host data the allocated target data +// will be associated with (HostPtr may be NULL if it is not known at +// allocation time, like for example it would be for target data that +// is allocated by omp_target_alloc() API). Return address of the +// allocated data on the target that will be used by libomptarget.so to +// initialize the target data mapping structures. These addresses are +// used to generate a table of target variables to pass to +// __tgt_rtl_run_target_region(). The __tgt_rtl_data_alloc() returns NULL in +// case an error occurred on the target device. +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); + +// Pass the data content to the target device using the target address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); + +// Retrieve the data content from the target device using its address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); + +// De-allocate the data referenced by target ptr on the device. In case of +// success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); + +// Transfer control to the offloaded entry Entry on the target device. +// Args and Offsets are arrays of NumArgs size of target addresses and +// offsets. An offset should be added to the target address before passing it +// to the outlined function on device side. In case of success, return zero. +// Otherwise, return an error code. +int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs); + +// Similar to __tgt_rtl_run_target_region, but additionally specify the +// number of teams to be created and a number of threads in each team. 
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#endif // _OMPTARGETPLUGIN_H_