diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -822,6 +822,24 @@ return LoopTripCount; } +/// A data structure for the information of first-private arguments. We can use +/// this information to optimize data transfer by packing all first-private +/// arguments and transfer them all at once. +struct FirstPrivateArgInfoTy { + /// The index of the element in \p TgtArgs corresponding to the argument + const int Index; + /// Host pointer begin + const char *HstPtrBegin; + /// Host pointer end + const char *HstPtrEnd; + /// Aligned size + const int64_t AlignedSize; + + FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size) + : Index(Index), HstPtrBegin(reinterpret_cast(HstPtr)), + HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment) {} +}; + /// Process data before launching the kernel, including calling targetDataBegin /// to map and transfer data to target device, transferring (first-)private /// variables. @@ -830,7 +848,7 @@ int64_t *ArgTypes, void **ArgMappers, std::vector &TgtArgs, std::vector &TgtOffsets, - std::vector &FPArrays, + std::vector &FirstPrivateArgs, __tgt_async_info *AsyncInfo) { DeviceTy &Device = Devices[DeviceId]; int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, @@ -842,6 +860,12 @@ // List of (first-)private arrays allocated for this target region std::vector TgtArgsPositions(ArgNum, -1); + std::vector FirstPrivateArgInfo; + int64_t FirstPrivateArgSize = 0; + // TODO: What would be the best value here? Should we make it configurable? + // If the size is larger than this threshold, we will allocate and transfer it + // immediately instead of packing it. + constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; for (int32_t I = 0; I < ArgNum; ++I) { if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { @@ -900,33 +924,56 @@ TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { - // Allocate memory for (first-)private array - TgtPtrBegin = Device.allocData(ArgSizes[I], HstPtrBegin); - if (!TgtPtrBegin) { - DP("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); - return OFFLOAD_FAIL; - } - FPArrays.push_back(TgtPtrBegin); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; + const bool IsFirstPrivate = ArgTypes[I] & OMP_TGT_MAPTYPE_TO; + const int64_t ArgSize = ArgSizes[I]; + // If the size does not exceed the threshold, or the argument doesn't need + // to be mapped to target device, we will allocate data immediately. + if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) { + // Allocate memory for (first-)private array + TgtPtrBegin = Device.allocData(ArgSize, HstPtrBegin); + if (!TgtPtrBegin) { + DP("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; + } #ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - ArgSizes[I], DPxPTR(TgtPtrBegin), - (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + ArgSize, DPxPTR(TgtPtrBegin), + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); #endif - // If first-private, copy data from host - if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) { - Ret = - Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSizes[I], AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); - return OFFLOAD_FAIL; + // If first-private, copy data from host + if (IsFirstPrivate) { + Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } } + FirstPrivateArgs.push_back(TgtPtrBegin); + } else { + DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", + DPxPTR(HstPtrBegin), ArgSize); + // When reach this point, the argument must meet all following + // requirements: + // 1. Its size does not exceed the threshold (see the comment for + // FirstPrivateArgSizeThreshold); + // 2. It must be first-private (needs to be mapped to target device). + // We will pack all this kind of arguments to transfer them all at once + // to reduce the number of data transfer. We will not take + // non-first-private arguments, aka. private arguments that doesn't need + // to be mapped to target device, into account because data allocation + // can be very efficient with memory manager. + + // Placeholder value + TgtPtrBegin = nullptr; + FirstPrivateArgInfo.emplace_back(TgtArgs.size(), HstPtrBegin, ArgSize); + FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; } } else { if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) @@ -948,6 +995,46 @@ assert(TgtArgs.size() == TgtOffsets.size() && "Size mismatch in arguments and offsets"); + if (!FirstPrivateArgInfo.empty()) { + assert(FirstPrivateArgSize != 0 && + "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty"); + std::vector Buffer(FirstPrivateArgSize, 0); + auto Itr = Buffer.begin(); + // Copy all host data to this buffer + for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + Itr = std::next(Itr, Info.AlignedSize); + } + // Allocate target memory + void *TgtPtr = Device.allocData(FirstPrivateArgSize, Buffer.data()); + if (TgtPtr == nullptr) { + DP("Failed to allocate target memory for private arguments.\n"); + return OFFLOAD_FAIL; + } + FirstPrivateArgs.push_back(TgtPtr); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", + FirstPrivateArgSize, DPxPTR(TgtPtr)); + // Transfer data to target device + Ret = Device.submitData(TgtPtr, Buffer.data(), FirstPrivateArgSize, + AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to submit data of private arguments.\n"); + return OFFLOAD_FAIL; + } + // Fill in all placeholder pointers + auto TP = reinterpret_cast(TgtPtr); + for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { + void *&Ptr = TgtArgs[Info.Index]; + assert(Ptr == nullptr && "Target pointer is already set by mistaken"); + Ptr = reinterpret_cast(TP); + TP += Info.AlignedSize; + DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD + "\n", + DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, + DPxPTR(Ptr)); + } + } + return OFFLOAD_SUCCESS; } @@ -1016,12 +1103,12 @@ std::vector TgtArgs; std::vector TgtOffsets; - std::vector FPArrays; + std::vector FPArray; // Process data, such as data mapping, before launching the kernel int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgMappers, TgtArgs, - TgtOffsets, FPArrays, &AsyncInfo); + TgtOffsets, FPArray, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to process data before launching the kernel.\n"); return OFFLOAD_FAIL; @@ -1051,7 +1138,7 @@ // Transfer data back and deallocate target memory for (first-)private // variables Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgMappers, FPArrays, &AsyncInfo); + ArgTypes, ArgMappers, FPArray, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; diff --git a/openmp/libomptarget/test/mapping/private_mapping.c b/openmp/libomptarget/test/mapping/private_mapping.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/mapping/private_mapping.c @@ -0,0 +1,47 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda +// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 \ +// RUN: | %fcheck-nvptx64-nvidia-cuda + +#include +#include + +int main() { + int data1[3] = {1}, data2[3] = {2}, data3[3] = {3}; + int sum[16] = {0}; +#pragma omp target teams distribute parallel for map(tofrom \ + : sum) \ + firstprivate(data1, data2, data3) + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 3; ++j) { + sum[i] += data1[j]; + sum[i] += data2[j]; + sum[i] += data3[j]; + } + } + + for (int i = 0; i < 16; ++i) { + assert(sum[i] == 6); + } + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS