Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -388,6 +388,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +class AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -414,6 +416,10 @@ std::mutex PendingGlobalsMtx; + /// Manages AsyncInfo associated to the device and thread + /// depending on the value of the env variable LIBOMPTARGET_INTRA_THREAD_ASYNC + AsyncInfoMng AIM; + DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable DeviceTy(const DeviceTy &D) = delete; Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -17,12 +17,16 @@ #include #include #include +#include #include #include +#include #include #include +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/SmallVector.h" #define OFFLOAD_SUCCESS (0) @@ -199,6 +203,8 @@ class AsyncInfoTy { public: enum class SyncTy { BLOCKING, NON_BLOCKING }; + /// Specify if the queue has data transfer operation + bool HasDataTransfer = false; private: /// Locations we used in (potentially) asynchronous calls which should live @@ -218,9 +224,20 @@ /// Synchronization method to be used. SyncTy SyncType; - AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) - : Device(Device), SyncType(SyncType) {} - ~AsyncInfoTy() { synchronize(); } +private: + /// Enables/Disable synchronization in destructor + const bool ShouldSyncWhenDestroyed; + +public: + AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING, + bool ShouldSyncWhenDestroyed = true) + : Device(Device), SyncType(SyncType), + ShouldSyncWhenDestroyed(ShouldSyncWhenDestroyed) {} + ~AsyncInfoTy() { + if (ShouldSyncWhenDestroyed) + synchronize(true); + } + AsyncInfoTy *get() { return this; } /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. @@ -228,6 +245,13 @@ /// Synchronize all pending actions. /// + /// \note synchronization is performed when: + /// - LIBOMPTARGET_INTRA_THREAD_ASYNC is disabled + /// - LIBOMPTARGET_INTRA_THREAD_ASYNC is enabled and there is a memory + /// transfer (hasDataTransfer=true) + /// in the target region + /// - Synchronization is forced (ForceSync = true) + /// /// \note synchronization will be performance in a blocking or non-blocking /// manner, depending on the SyncType. /// @@ -235,7 +259,7 @@ /// functions will be executed once and unregistered afterwards. /// /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately. - int synchronize(); + int synchronize(bool ForceSync = false); /// Return a void* reference with a lifetime that is at least as long as this /// AsyncInfoTy object. The location can be used as intermediate buffer. @@ -278,6 +302,40 @@ bool isQueueEmpty() const; }; +/// Specialize DenseMapInfo for std::thread::id +template <> struct llvm::DenseMapInfo { + + static std::thread::id getEmptyKey() { return std::thread::id(); } + + static std::thread::id getTombstoneKey() { return std::thread::id(); } + + static unsigned getHashValue(const std::thread::id &Val) { + static std::hash hasher; + return hasher(Val); + } + + static bool isEqual(std::thread::id LHS, std::thread::id RHS) { + return LHS == RHS; + } +}; + +/// This structs allows for automatic asynchronous execution of target regions. +/// It associates an AsyncInfoTy object with a thread id to guarantee that +/// target tasks of the same thread are launched on the same queue. +class AsyncInfoMng { + llvm::DenseMap AsyncInfoM; + DeviceTy &Device; + std::mutex AsyncMtx; + +public: + AsyncInfoMng(DeviceTy &Device); + ~AsyncInfoMng(); + + /// Replaces AI with the associated AsyncInfo object when + /// LIBOMPTARGET_INTRA_THREAD_ASYNC is enabled. + AsyncInfoTy ®isterAI(AsyncInfoTy *AI); +}; + /// This struct is a record of non-contiguous information struct __tgt_target_non_contig { uint64_t Offset; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -52,7 +52,8 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), PendingCtorsDtors(), PendingGlobalsMtx() {} + HasPendingGlobals(false), PendingCtorsDtors(), PendingGlobalsMtx(), + AIM(*this) {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -100,7 +100,7 @@ DeviceTy &Device = *PM->Devices[DeviceId]; TargetAsyncInfoTy TargetAsyncInfo(Device); - AsyncInfoTy &AsyncInfo = TargetAsyncInfo; + AsyncInfoTy &AsyncInfo = Device.AIM.registerAI(TargetAsyncInfo.get()); int Rc = OFFLOAD_SUCCESS; Rc = TargetDataFunction(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, @@ -108,7 +108,7 @@ false /* FromMapper */); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AsyncInfo.synchronize(true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -270,13 +270,15 @@ DeviceTy &Device = *PM->Devices[DeviceId]; TargetAsyncInfoTy TargetAsyncInfo(Device); - AsyncInfoTy &AsyncInfo = TargetAsyncInfo; + AsyncInfoTy &AsyncInfo = Device.AIM.registerAI(TargetAsyncInfo.get()); int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo); - if (Rc == OFFLOAD_SUCCESS) + if (Rc == OFFLOAD_SUCCESS) { + /// Synchronization is not forced here. Rc = AsyncInfo.synchronize(); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); @@ -342,7 +344,7 @@ TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, LoopTripCount, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AsyncInfo.synchronize(true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel_replay unexpected failure!"); Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -20,27 +20,34 @@ #include #include +#include #include using llvm::SmallVector; -int AsyncInfoTy::synchronize() { +bool AsyncFlag = false; + +int AsyncInfoTy::synchronize(bool ForceSync) { int Result = OFFLOAD_SUCCESS; if (!isQueueEmpty()) { switch (SyncType) { case SyncTy::BLOCKING: - // If we have a queue we need to synchronize it now. - Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && - "The device plugin should have nulled the queue to indicate there " - "are no outstanding actions!"); + if (!AsyncFlag || (AsyncFlag && HasDataTransfer) || ForceSync) { + // If we have a queue we need to synchronize it now. + DP("Device %d synchronization\n", Device.DeviceID); + Result = Device.synchronize(*this); + HasDataTransfer = false; + assert( + AsyncInfo.Queue == nullptr && + "The device plugin should have nulled the queue to indicate there " + "are no outstanding actions!"); + } break; case SyncTy::NON_BLOCKING: Result = Device.queryAsync(*this); break; } } - // Run any pending post-processing function registered on this async object. if (Result == OFFLOAD_SUCCESS && isQueueEmpty()) Result = runPostProcessing(); @@ -73,6 +80,32 @@ bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; } +/// Async info manager +AsyncInfoMng::AsyncInfoMng(DeviceTy &Device) : Device(Device) {} + +AsyncInfoMng::~AsyncInfoMng() { + for (const auto &It : AsyncInfoM) + delete (It.second); + AsyncInfoM.clear(); +} + +AsyncInfoTy &AsyncInfoMng::registerAI(AsyncInfoTy *AI) { + if (!AsyncFlag) + return *AI; + + /// Get async info + std::lock_guard MapLock(AsyncMtx); + llvm::DenseMap::iterator it = + AsyncInfoM.find(std::this_thread::get_id()); + if (it != AsyncInfoM.end()) + return *(it->second); + /// Add new Async Info + AsyncInfoTy *AsyncInfo = + new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::BLOCKING, false); + AsyncInfoM.insert(std::make_pair(std::this_thread::get_id(), AsyncInfo)); + return *AsyncInfo; +} + /* All begin addresses for partially mapped structs must be aligned, up to 16, * in order to ensure proper alignment of members. E.g. * @@ -246,7 +279,7 @@ } } // All constructors have been issued, wait for them now. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + if (AsyncInfo.synchronize(true) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; } Device.HasPendingGlobals = false; @@ -958,7 +991,7 @@ !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - + AsyncInfo.HasDataTransfer = true; // Wait for any previous transfer if an event is present. if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -384,6 +384,7 @@ *TaskAsyncInfoPtr = nullptr; } + AsyncInfoTy *get() { return AsyncInfo; } operator AsyncInfoTy &() { return *AsyncInfo; } }; Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -42,6 +42,7 @@ PluginManager *PM; static char *ProfileTraceFile = nullptr; +extern bool AsyncFlag; __attribute__((constructor(101))) void init() { DP("Init target library!\n"); @@ -61,6 +62,12 @@ PM = new PluginManager(UseEventsForAtomicTransfers); ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); + + /// Asynchronous execution flag + if (char *EnvStr = getenv("LIBOMPTARGET_INTRA_THREAD_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + DP("Asynchronous execution %s\n", AsyncFlag ? "Enabled" : "Disabled"); + // TODO: add a configuration option for time granularity if (ProfileTraceFile) timeTraceProfilerInitialize(500 /* us */, "libomptarget"); @@ -539,7 +546,7 @@ // Remove this library's entry from PendingCtorsDtors Device.PendingCtorsDtors.erase(Desc); // All constructors have been issued, wait for them now. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + if (AsyncInfo.synchronize(true) != OFFLOAD_SUCCESS) DP("Failed synchronizing destructors kernels.\n"); } Device.PendingGlobalsMtx.unlock(); Index: openmp/libomptarget/test/offloading/automatic_async_exec.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/offloading/automatic_async_exec.c @@ -0,0 +1,51 @@ +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -I %S +// RUN: env LIBOMPTARGET_INTRA_THREAD_ASYNC=1 \ +// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda +// REQUIRES: nvptx64-nvidia-cuda + +#include +#include +#include + +int main () { + int a=100; + + #pragma omp target data map(tofrom: a) + { + printf("Result host 1 - %d \n", a); + #pragma omp target map(tofrom: a) + { + a = a + 3; + printf("Result target 1 - %d \n", a); + } + + printf("Result host 2 - %d \n", a); + + #pragma omp target + { + a = a + 5; + assert(a == 105); + printf("Result target 2 - %d \n", a); + } + + #pragma omp target update from(a) + assert(a == 103); + printf("Result host 3 - %d \n", a); + + for(int i=0; i<100; i++) { + a++; + } + assert(a == 203); + printf("Result host 4 - %d \n", a); + } + assert(a == 103); + + return 0; +} + +// CHECK: Result host 1 - 100 +// CHECK: Result host 2 - 100 +// CHECK: Result target 1 - 103 +// CHECK: Result target 2 - 105 +// CHECK: Result host 3 - 103 +// CHECK: Result host 4 - 203 \ No newline at end of file