diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -24,8 +24,6 @@ int DebugLevel = 0; #endif // OMPTARGET_DEBUG - - /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -59,7 +57,7 @@ static const int64_t Alignment = 8; /// Map global data and execute pending ctors -static int InitLibrary(DeviceTy& Device) { +static int InitLibrary(DeviceTy &Device) { /* * Map global data */ @@ -68,9 +66,9 @@ Device.PendingGlobalsMtx.lock(); TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(); - ii != HostEntriesBeginToTransTable->end(); ++ii) { + for (HostEntriesBeginToTransTableTy::iterator ii = + HostEntriesBeginToTransTable->begin(); + ii != HostEntriesBeginToTransTable->end(); ++ii) { TranslationTable *TransTable = &ii->second; if (TransTable->HostTable.EntriesBegin == TransTable->HostTable.EntriesEnd) { @@ -92,8 +90,8 @@ break; } // 2) load image into the target table. - __tgt_target_table *TargetTable = - TransTable->TargetsTable[device_id] = Device.load_binary(img); + __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] = + Device.load_binary(img); // Unable to get table for this image: invalidate image and fail. if (!TargetTable) { DP("Unable to generate entries table for device id %d.\n", device_id); @@ -137,8 +135,9 @@ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) continue; DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" - "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), - CurrDeviceEntry->size); + "\n", + DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); Device.HostDataToTargetMap.emplace( (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, @@ -167,7 +166,7 @@ for (auto &entry : lib.second.PendingCtors) { void *ctor = entry; int rc = target(device_id, ctor, 0, NULL, NULL, NULL, NULL, NULL, 1, - 1, true /*team*/); + 1, true /*team*/); if (rc != OFFLOAD_SUCCESS) { DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -225,7 +224,7 @@ MapperComponentsTy MapperComponents; MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper); (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, - arg_type); + arg_type); // Construct new arrays for args_base, args, arg_sizes and arg_types // using the information in MapperComponents and call the corresponding @@ -292,14 +291,15 @@ // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. int64_t padding = 0; - const int next_i = i+1; + const int next_i = i + 1; if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && getParentIndex(arg_types[next_i]) == i) { padding = (int64_t)HstPtrBegin % Alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; + "\n", + padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - padding; data_size += padding; } } @@ -344,8 +344,9 @@ return OFFLOAD_FAIL; } DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), - (Pointer_IsNew ? "" : " not")); + "\n", + sizeof(void *), DPxPTR(PointerTgtPtrBegin), + (Pointer_IsNew ? "" : " not")); Pointer_HstPtrBegin = HstPtrBase; // modify current entry. HstPtrBase = *(void **)HstPtrBase; @@ -364,8 +365,8 @@ return OFFLOAD_FAIL; } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); + " - is%s new\n", + data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; @@ -643,9 +644,9 @@ /// Internal function to pass data to/from the target. // async_info_ptr is currently unused, added here so target_data_update has the // same signature as targetDataBegin and targetDataEnd. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, __tgt_async_info *async_info_ptr) { +int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || @@ -678,7 +679,8 @@ void *TgtPtrBegin = Device.getTgtPtrBegin( HstPtrBegin, MapSize, IsLast, false, IsHostPtr, /*MustContain=*/true); if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + DP("hst data:" DPxMOD " not found, becomes a noop\n", + DPxPTR(HstPtrBegin)); if (arg_types[i] & OMP_TGT_MAPTYPE_PRESENT) { MESSAGE("device mapping required by 'present' motion modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", @@ -697,26 +699,26 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); int rt = Device.retrieveData(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void **)it->first; + if ((uintptr_t)ShadowHstPtrAddr < lb) continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= ub) break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(it->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); *ShadowHstPtrAddr = it->second.HstPtrVal; } Device.ShadowMtx.unlock(); @@ -724,18 +726,18 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { + it != Device.ShadowPtrMap.end(); ++it) { void **ShadowHstPtrAddr = (void **)it->first; if ((uintptr_t)ShadowHstPtrAddr < lb) continue; @@ -822,6 +824,24 @@ return LoopTripCount; } +/// A data structure for the information of first-private arguments. We can use +/// this information to optimize data transfer by packing all first-private +/// arguments and transfer them all at once. +struct FPArgInfoTy { + /// The index of the element in \p FPArrays corresponding to the argument + int Index; + /// Host pointer + void *HstPtr; + /// Size of the argument + int64_t Size; + /// Aligned size + int64_t AlignedSize; + + FPArgInfoTy(int Index, void *HstPtr, int64_t Size) + : Index(Index), HstPtr(HstPtr), Size(Size), + AlignedSize(Size + Size % Alignment) {} +}; + /// Process data before launching the kernel, including calling targetDataBegin /// to map and transfer data to target device, transferring (first-)private /// variables. @@ -830,7 +850,7 @@ int64_t *ArgTypes, void **ArgMappers, std::vector &TgtArgs, std::vector &TgtOffsets, - std::vector &FPArrays, + std::vector &FPArray, __tgt_async_info *AsyncInfo) { DeviceTy &Device = Devices[DeviceId]; int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, @@ -842,6 +862,10 @@ // List of (first-)private arrays allocated for this target region std::vector TgtArgsPositions(ArgNum, -1); + std::vector PrivateArgInfo; + size_t FPArgSize = 0; + // TODO: What would be the best value here? Should we make it configurable? + constexpr const int64_t FPArgSizeThreshold = 1024; for (int32_t I = 0; I < ArgNum; ++I) { if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { @@ -900,33 +924,55 @@ TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { - // Allocate memory for (first-)private array - TgtPtrBegin = Device.allocData(ArgSizes[I], HstPtrBegin); - if (!TgtPtrBegin) { - DP("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); - return OFFLOAD_FAIL; - } - FPArrays.push_back(TgtPtrBegin); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; + const bool MapTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO; + const int64_t ArgSize = ArgSizes[I]; + // If the size does not exceed the threshold, or the argument doesn't need + // to be mapped to target device, we will allocate data immediately. + if (ArgSize > FPArgSizeThreshold || !MapTo) { + // Allocate memory for (first-)private array + TgtPtrBegin = Device.allocData(ArgSize, HstPtrBegin); + if (!TgtPtrBegin) { + DP("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; + } #ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - ArgSizes[I], DPxPTR(TgtPtrBegin), - (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + ArgSize, DPxPTR(TgtPtrBegin), + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); #endif - // If first-private, copy data from host - if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) { - Ret = - Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSizes[I], AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); - return OFFLOAD_FAIL; + // If first-private, copy data from host + if (MapTo) { + Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } } + FPArray.push_back(TgtPtrBegin); + } else { + DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", + DPxPTR(HstPtrBegin), ArgSize); + // When reach this point, the argument must meet all following + // requirements: + // 1. Its size does not exceed the threshold; + // 2. It must be first-private (needs to be mapped to target device). + // We will pack all this kind of arguments to transfer then all at once + // to reduce the number of data transfer. We will not take + // non-first-private arguments, aka. private arguments that doesn't need + // to be mapped to target device, into account because data allocation + // can be very efficient with memory manager. + + // Placeholder value + TgtPtrBegin = nullptr; + PrivateArgInfo.emplace_back(TgtArgs.size(), HstPtrBegin, ArgSize); + FPArgSize += PrivateArgInfo.back().AlignedSize; } } else { if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) @@ -948,6 +994,45 @@ assert(TgtArgs.size() == TgtOffsets.size() && "Size mismatch in arguments and offsets"); + if (!PrivateArgInfo.empty()) { + assert(FPArgSize != 0 && "FPArgSize is 0 but PrivateArgInfo is empty"); + std::vector Buffer(FPArgSize, 0); + auto Itr = Buffer.begin(); + // Copy all host data to this buffer + for (FPArgInfoTy &Info : PrivateArgInfo) { + const char *Ptr = reinterpret_cast(Info.HstPtr); + const int64_t Size = Info.Size; + std::copy(Ptr, Ptr + Size, Itr); + Itr = std::next(Itr, Info.AlignedSize); + } + // Allocate target memory + void *TgtPtr = Device.allocData(FPArgSize, Buffer.data()); + if (TgtPtr == nullptr) { + DP("Failed to allocate target memory for private arguments.\n"); + return OFFLOAD_FAIL; + } + FPArray.push_back(TgtPtr); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", + FPArgSize, DPxPTR(TgtPtr)); + // Transfer data to target device + Ret = Device.submitData(TgtPtr, Buffer.data(), FPArgSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to submit data of private arguments.\n"); + return OFFLOAD_FAIL; + } + // Fill in all placeholder pointers + auto TP = reinterpret_cast(TgtPtr); + for (FPArgInfoTy &Info : PrivateArgInfo) { + void *&Ptr = TgtArgs[Info.Index]; + assert(Ptr == nullptr && "Target pointer is already set by mistaken"); + Ptr = reinterpret_cast(TP); + TP += Info.AlignedSize; + DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD + "\n", + DPxPTR(Info.HstPtr), Info.Size, DPxPTR(Ptr)); + } + } + return OFFLOAD_SUCCESS; } @@ -1016,12 +1101,12 @@ std::vector TgtArgs; std::vector TgtOffsets; - std::vector FPArrays; + std::vector FPArray; // Process data, such as data mapping, before launching the kernel int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgMappers, TgtArgs, - TgtOffsets, FPArrays, &AsyncInfo); + TgtOffsets, FPArray, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to process data before launching the kernel.\n"); return OFFLOAD_FAIL; @@ -1051,7 +1136,7 @@ // Transfer data back and deallocate target memory for (first-)private // variables Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgMappers, FPArrays, &AsyncInfo); + ArgTypes, ArgMappers, FPArray, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; diff --git a/openmp/libomptarget/test/mapping/private_mapping.c b/openmp/libomptarget/test/mapping/private_mapping.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/mapping/private_mapping.c @@ -0,0 +1,46 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda +// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 \ +// RUN: | %fcheck-nvptx64-nvidia-cuda + +#include +#include + +int main() { + int data1[3] = {1}, data2[3] = {2}, data3[3] = {3}; + int sum[16] = {0}; +#pragma omp target teams distribute parallel for map(tofrom: sum) \ + firstprivate(data1, data2, data3) + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 3; ++j) { + sum[i] += data1[j]; + sum[i] += data2[j]; + sum[i] += data3[j]; + } + } + + for (int i = 0; i < 16; ++i) { + assert(sum[i] == 6); + } + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS