diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -52,6 +52,8 @@ OMP_TGT_MAPTYPE_CLOSE = 0x400, // runtime error if not already allocated OMP_TGT_MAPTYPE_PRESENT = 0x1000, + // descriptor for non-contiguous target-update + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; @@ -123,6 +125,13 @@ void *Queue = nullptr; }; +/// This struct is a record of non-contiguous information +struct __tgt_target_non_contig { + uint64_t Offset; + uint64_t Count; + uint64_t Stride; +}; + #ifdef __cplusplus extern "C" { #endif diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -277,7 +277,7 @@ return rc; } -// Used by targetDataBegin, targetDataEnd, target_data_update and target. +// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target. // Return the target pointer begin (where the data will be moved). // Decrement the reference counter if called from targetDataEnd. void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -273,8 +273,8 @@ } DeviceTy &Device = PM->Devices[device_id]; - int rc = target_data_update(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers); + int rc = targetDataUpdate(Device, arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers); HandleTargetOutcome(rc == OFFLOAD_SUCCESS); } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -639,122 +639,182 @@ return OFFLOAD_SUCCESS; } -/// Internal function to pass data to/from the target. -// async_info_ptr is currently unused, added here so target_data_update has the -// same signature as targetDataBegin and targetDataEnd. -int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - map_var_info_t *arg_names, void **arg_mappers, - __tgt_async_info *async_info_ptr) { - // process each input. - for (int32_t i = 0; i < arg_num; ++i) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - if (arg_mappers && arg_mappers[i]) { - // Instead of executing the regular path of target_data_update, call the - // targetDataMapper variant which will call target_data_update again - // with new arguments. - DP("Calling targetDataMapper for the %dth argument\n", i); - - int rc = - targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_mappers[i], target_data_update); +static int targetDataContiguous(DeviceTy &Device, void *ArgsBase, + void *HstPtrBegin, int64_t ArgSize, + int64_t ArgType) { + bool IsLast, IsHostPtr; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, + IsHostPtr, /*MustContain=*/true); + if (!TgtPtrBegin) { + DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + if (ArgType & OMP_TGT_MAPTYPE_PRESENT) { + MESSAGE("device mapping required by 'present' motion modifier does not " + "exist for host address " DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), ArgSize); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; + } - if (rc != OFFLOAD_SUCCESS) { - REPORT( - "Call to target_data_update via targetDataMapper for custom mapper" - " failed.\n"); - return OFFLOAD_FAIL; - } + if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", + DPxPTR(HstPtrBegin)); + return OFFLOAD_SUCCESS; + } - // Skip the rest of this function, continue to the next argument. - continue; + if (ArgType & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data from device failed.\n"); + return OFFLOAD_FAIL; } - void *HstPtrBegin = args[i]; - int64_t MapSize = arg_sizes[i]; - bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin( - HstPtrBegin, MapSize, IsLast, false, IsHostPtr, /*MustContain=*/true); - if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); - if (arg_types[i] & OMP_TGT_MAPTYPE_PRESENT) { - MESSAGE("device mapping required by 'present' motion modifier does not " - "exist for host address " DPxMOD " (%" PRId64 " bytes)", - DPxPTR(HstPtrBegin), MapSize); - return OFFLOAD_FAIL; - } - continue; + uintptr_t LB = (uintptr_t)HstPtrBegin; + uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); + IT != Device.ShadowPtrMap.end(); ++IT) { + void **ShadowHstPtrAddr = (void **)IT->first; + if ((uintptr_t)ShadowHstPtrAddr < LB) + continue; + if ((uintptr_t)ShadowHstPtrAddr >= UB) + break; + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = IT->second.HstPtrVal; } + Device.ShadowMtx.unlock(); + } - if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", - DPxPTR(HstPtrBegin)); - continue; + if (ArgType & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + return OFFLOAD_FAIL; } - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.retrieveData(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - REPORT("Copying data from device failed.\n"); + uintptr_t LB = (uintptr_t)HstPtrBegin; + uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); + IT != Device.ShadowPtrMap.end(); ++IT) { + void **ShadowHstPtrAddr = (void **)IT->first; + if ((uintptr_t)ShadowHstPtrAddr < LB) + continue; + if ((uintptr_t)ShadowHstPtrAddr >= UB) + break; + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", + DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr)); + Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal, + sizeof(void *), nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + Device.ShadowMtx.unlock(); return OFFLOAD_FAIL; } + } + Device.ShadowMtx.unlock(); + } + return OFFLOAD_SUCCESS; +} - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; +static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase, + __tgt_target_non_contig *NonContig, + uint64_t Size, int64_t ArgType, + int CurrentDim, int DimSize, + uint64_t Offset) { + int Ret = OFFLOAD_SUCCESS; + if (CurrentDim < DimSize) { + for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { + uint64_t CurOffset = + (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; + // we only need to transfer the first element for the last dimension + // since we've already got a contiguous piece. + if (CurrentDim != DimSize - 1 || I == 0) { + Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size, + ArgType, CurrentDim + 1, DimSize, + Offset + CurOffset); + // Stop the whole process if any contiguous piece returns anything + // other than OFFLOAD_SUCCESS. + if (Ret != OFFLOAD_SUCCESS) + return Ret; } - Device.ShadowMtx.unlock(); } + } else { + char *Ptr = (char *)ArgsBase + Offset; + DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n", + (uint64_t)Ptr, Offset, Size); + Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType); + } + return Ret; +} - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); +static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, + int32_t DimSize) { + int RemovedDim = 0; + for (int I = DimSize - 1; I > 0; --I) { + if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) + RemovedDim++; + } + return RemovedDim; +} + +/// Internal function to pass data to/from the target. +// async_info_ptr is currently unused, added here so targetDataUpdate has the +// same signature as targetDataBegin and targetDataEnd. +int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, + __tgt_async_info *AsyncInfoPtr) { + // process each input. + for (int32_t I = 0; I < ArgNum; ++I) { + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + if (ArgMappers && ArgMappers[I]) { + // Instead of executing the regular path of targetDataUpdate, call the + // targetDataMapper variant which will call targetDataUpdate again + // with new arguments. + DP("Calling targetDataMapper for the %dth argument\n", I); + + int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgMappers[I], targetDataUpdate); + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper" + " failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void **)it->first; - if ((uintptr_t)ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t)ShadowHstPtrAddr >= ub) - break; - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", - DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr)); - rt = Device.submitData(it->second.TgtPtrAddr, &it->second.TgtPtrVal, - sizeof(void *), nullptr); - if (rt != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); - Device.ShadowMtx.unlock(); - return OFFLOAD_FAIL; - } - } - Device.ShadowMtx.unlock(); + // Skip the rest of this function, continue to the next argument. + continue; } + + int Ret = OFFLOAD_SUCCESS; + + if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { + __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; + int32_t DimSize = ArgSizes[I]; + uint64_t Size = + NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; + int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); + Ret = targetDataNonContiguous( + Device, ArgsBase[I], NonContig, Size, ArgTypes[I], + /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0); + } else { + Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I], + ArgTypes[I]); + } + if (Ret == OFFLOAD_FAIL) + return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -28,11 +28,10 @@ map_var_info_t *arg_names, void **ArgMappers, __tgt_async_info *AsyncInfo); -extern int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, - __tgt_async_info *async_info_ptr = nullptr); +extern int targetDataUpdate(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + __tgt_async_info *async_info_ptr = nullptr); extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, @@ -68,7 +67,7 @@ typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t); // Function pointer type for target_data_* functions (targetDataBegin, -// targetDataEnd and target_data_update). +// targetDataEnd and targetDataUpdate). typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, __tgt_async_info *); diff --git a/openmp/libomptarget/test/offloading/non_contiguous_update.cpp b/openmp/libomptarget/test/offloading/non_contiguous_update.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/non_contiguous_update.cpp @@ -0,0 +1,101 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +#include +#include +#include + +// Data structure definitions copied from OpenMP RTL. +struct __tgt_target_non_contig { + int64_t offset; + int64_t width; + int64_t stride; +}; + +enum tgt_map_type { + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 +}; + +// OpenMP RTL interfaces +#ifdef __cplusplus +extern "C" { +#endif +void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +#ifdef __cplusplus +} +#endif + +int main() { + // case 1 + // int arr[3][4][5][6]; + // #pragma omp target update to(arr[0:2][1:3][1:2][:]) + // set up descriptor + __tgt_target_non_contig non_contig[5] = { + {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}}; + int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG; + + void *base; + void *begin = &non_contig; + int64_t *sizes = &size; + int64_t *types = &type; + + // The below diagram is the visualization of the non-contiguous transfer after + // optimization. Note that each element represent the innermost dimension + // (unit size = 24) since the stride * count of last dimension is equal to the + // stride of second last dimension. + // + // OOOOO OOOOO OOOOO + // OXXOO OXXOO OOOOO + // OXXOO OXXOO OOOOO + // OXXOO OXXOO OOOOO + __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin, + sizes, types); + // DEBUG: offset 144 + // DEBUG: offset 264 + // DEBUG: offset 384 + // DEBUG: offset 624 + // DEBUG: offset 744 + // DEBUG: offset 864 + + + // case 2 + // double darr[3][4][5]; + // #pragma omp target update to(darr[0:2:2][2:2][:2:2]) + // set up descriptor + __tgt_target_non_contig non_contig_2[4] = { + {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}}; + int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG; + + void *base_2; + void *begin_2 = &non_contig_2; + int64_t *sizes_2 = &size_2; + int64_t *types_2 = &type_2; + + // The below diagram is the visualization of the non-contiguous transfer after + // optimization. Note that each element represent the innermost dimension + // (unit size = 24) since the stride * count of last dimension is equal to the + // stride of second last dimension. + // + // OOOOO OOOOO OOOOO + // OOOOO OOOOO OOOOO + // XOXOO OOOOO XOXOO + // XOXOO OOOOO XOXOO + __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2, + sizes_2, types_2); + // DEBUG: offset 80 + // DEBUG: offset 96 + // DEBUG: offset 120 + // DEBUG: offset 136 + // DEBUG: offset 400 + // DEBUG: offset 416 + // DEBUG: offset 440 + // DEBUG: offset 456 + return 0; +} +