diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "ExclusiveAccess.h" @@ -60,7 +61,8 @@ struct StatesTy { StatesTy(uint64_t DRC, uint64_t HRC) : DynRefCount(DRC), HoldRefCount(HRC), - MayContainAttachedPointers(false) {} + MayContainAttachedPointers(false), DeleteThreadId(std::thread::id()) { + } /// The dynamic reference count is the standard reference count as of OpenMP /// 4.5. The hold reference count is an OpenMP extension for the sake of /// OpenACC support. @@ -98,6 +100,14 @@ /// mechanism for D2H, and if the event cannot be shared between them, Event /// should be written as void *Event[2]. void *Event = nullptr; + + /// The id of the thread responsible for deleting this entry. This thread + /// set the reference count to zero *last*. Other threads might reuse the + /// entry while it is marked for deletion but not yet deleted (e.g., the + /// data is still being moved back). If another thread reuses the entry we + /// will have a non-zero reference count *or* the thread will have changed + /// this id, effectively taking over deletion responsibility. + std::thread::id DeleteThreadId; }; // When HostDataToTargetTy is used by std::set, std::set::iterator is const // use unique_ptr to make States mutable. @@ -138,6 +148,14 @@ /// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise. int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const; + /// Indicate that the current thread expected to delete this entry. + void setDeleteThreadId() const { + States->DeleteThreadId = std::this_thread::get_id(); + } + + /// Return the thread id of the thread expected to delete this entry. + std::thread::id getDeleteThreadId() const { return States->DeleteThreadId; } + /// Set the event bound to this data map. void setEvent(void *Event) const { States->Event = Event; } @@ -172,7 +190,7 @@ if (ThisRefCount > 0) --ThisRefCount; else - assert(OtherRefCount > 0 && "total refcount underflow"); + assert(OtherRefCount >= 0 && "total refcount underflow"); } return getTotalRefCount(); } @@ -362,14 +380,16 @@ bool UseHoldRefCount, bool &IsHostPtr, bool MustContain = false, bool ForceDelete = false); - /// For the map entry for \p HstPtrBegin, decrement the reference count - /// specified by \p HasHoldModifier and, if the the total reference count is - /// then zero, deallocate the corresponding device storage and remove the map - /// entry. Return \c OFFLOAD_SUCCESS if the map entry existed, and return - /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling - /// this function if the map entry is not expected to exist because - /// \p HstPtrBegin uses shared memory. - int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier); + + /// Deallocate \p LR and remove the entry. Assume the total reference count is + /// zero and the calling thread is the deleting thread for \p LR. \p HDTTMap + /// ensure the caller holds exclusive access and can modify the map. Return \c + /// OFFLOAD_SUCCESS if the map entry existed, and return \c OFFLOAD_FAIL if + /// not. It is the caller's responsibility to skip calling this function if + /// the map entry is not expected to exist because \p HstPtrBegin uses shared + /// memory. + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -20,6 +20,7 @@ #include #include #include +#include int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const { @@ -207,9 +208,10 @@ ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) { auto &HT = *LR.Entry; const char *RefCountAction; - assert(HT.getTotalRefCount() > 0 && "expected existing RefCount > 0"); if (UpdateRefCount) { - // After this, RefCount > 1. + // After this, reference count >= 1. If the reference count was 0 but the + // entry was still there we can reuse the data on the device and avoid a + // new submission. HT.incRefCount(HasHoldModifier); RefCountAction = " (incremented)"; } else { @@ -349,27 +351,30 @@ if (lr.Flags.IsContained || (!MustContain && (lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter))) { auto &HT = *lr.Entry; - // We do not zero the total reference count here. deallocTgtPtr does that - // atomically with removing the mapping. Otherwise, before this thread - // removed the mapping in deallocTgtPtr, another thread could retrieve the - // mapping, increment and decrement back to zero, and then both threads - // would try to remove the mapping, resulting in a double free. IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete); - const char *RefCountAction; - if (!UpdateRefCount) { - RefCountAction = " (update suppressed)"; - } else if (ForceDelete) { + + if (ForceDelete) { HT.resetRefCount(UseHoldRefCount); assert(IsLast == HT.decShouldRemove(UseHoldRefCount) && "expected correct IsLast prediction for reset"); - if (IsLast) - RefCountAction = " (reset, deferred final decrement)"; - else { - HT.decRefCount(UseHoldRefCount); - RefCountAction = " (reset)"; - } + } + + const char *RefCountAction; + if (!UpdateRefCount) { + RefCountAction = " (update suppressed)"; } else if (IsLast) { - RefCountAction = " (deferred final decrement)"; + // Mark the entry as to be deleted by this thread. Another thread might + // reuse the entry and take "ownership" for the deletion while this thread + // is waiting for data transfers. That is fine and the current thread will + // simply skip the deletion step then. + HT.setDeleteThreadId(); + HT.decRefCount(UseHoldRefCount); + assert(HT.getTotalRefCount() == 0 && + "Expected zero reference count when deletion is scheduled"); + if (ForceDelete) + RefCountAction = " (reset, delayed deletion)"; + else + RefCountAction = " (decremented, delayed deletion)"; } else { HT.decRefCount(UseHoldRefCount); RefCountAction = " (decremented)"; @@ -411,37 +416,38 @@ return NULL; } -int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, - bool HasHoldModifier) { - HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); - +int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, + int64_t Size) { // Check if the pointer is contained in any sub-nodes. - int Ret = OFFLOAD_SUCCESS; - LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - if (HT.decRefCount(HasHoldModifier) == 0) { - DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", - DPxPTR(HT.TgtPtrBegin), Size); - deleteData((void *)HT.TgtPtrBegin); - INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID, - "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD - ", Size=%" PRId64 ", Name=%s\n", - DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size, - (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() - : "unknown"); - void *Event = lr.Entry->getEvent(); - HDTTMap->erase(lr.Entry); - delete lr.Entry; - if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { - REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); - Ret = OFFLOAD_FAIL; - } - } - } else { + if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || + LR.Flags.ExtendsAfter)) { REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the" " allocated memory\n", - DPxPTR(HstPtrBegin)); + DPxPTR(LR.Entry->HstPtrBegin)); + return OFFLOAD_FAIL; + } + + auto &HT = *LR.Entry; + // Verify this thread is still in charge of deleting the entry. + assert(HT.getTotalRefCount() == 0 && + HT.getDeleteThreadId() == std::this_thread::get_id() && + "Trying to delete entry that is in use or owned by another thread."); + + DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", + DPxPTR(HT.TgtPtrBegin), Size); + deleteData((void *)HT.TgtPtrBegin); + INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID, + "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", Size=%" PRId64 ", Name=%s\n", + DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size, + (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() : "unknown"); + void *Event = LR.Entry->getEvent(); + HDTTMap->erase(LR.Entry); + delete LR.Entry; + + int Ret = OFFLOAD_SUCCESS; + if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { + REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -607,17 +607,29 @@ namespace { /// This structure contains information to deallocate a target pointer, aka. -/// used to call the function \p DeviceTy::deallocTgtPtr. -struct DeallocTgtPtrInfo { +/// used to fix up the shadow map and potentially delete the entry from the +/// mapping table via \p DeviceTy::deallocTgtPtr. +struct PostProcessingInfo { /// Host pointer used to look up into the map table void *HstPtrBegin; + /// Size of the data int64_t DataSize; - /// Whether it has \p ompx_hold modifier - bool HasHoldModifier; - DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier) - : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {} + /// The mapping type (bitfield). + int64_t ArgType; + + /// The target pointer information. + TargetPointerResultTy TPR; + + /// Are we expecting to delete this entry or not. Even if set, we might not + /// delete the entry if another thread reused the entry in the meantime. + bool DelEntry; + + PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry, + TargetPointerResultTy TPR) + : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR), + DelEntry(DelEntry) {} }; /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to @@ -668,7 +680,7 @@ int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { int Ret; - std::vector DeallocTgtPtrs; + std::vector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. for (int32_t I = ArgNum - 1; I >= 0; --I) { @@ -786,12 +798,33 @@ if ((Always || IsLast) && !IsHostPtr) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + + std::lock_guard LG(*TPR.Entry); + // Wait for any previous transfer if an event is present. + if (void *Event = TPR.Entry->getEvent()) { + if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { + REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event)); + return OFFLOAD_FAIL; + } + } + Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; } + + // As we are expecting to delete the entry the d2h copy might race + // with another one that also tries to delete the entry. This happens + // as the entry can be reused and the reuse might happen after the + // copy-back was issued but before it completed. Since the reuse might + // also copy-back a value we would race. + if (IsLast) { + if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + } } } if (DelEntry && FromMapper && I == 0) { @@ -799,38 +832,9 @@ FromMapperBase = HstPtrBegin; } - // If we copied back to the host a struct/array containing pointers, we - // need to restore the original host pointer values from their shadow - // copies. If the struct is going to be deallocated, remove any remaining - // shadow pointer entries for this struct. - auto CB = [&](ShadowPtrListTy::iterator &Itr) { - // If we copied the struct to the host, we need to restore the pointer. - if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { - void **ShadowHstPtrAddr = (void **)Itr->first; - // Wait for device-to-host memcopies for whole struct to complete, - // before restoring the correct host pointer. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - *ShadowHstPtrAddr = Itr->second.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", - DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); - } - // If the struct is to be deallocated, remove the shadow entry. - if (DelEntry) { - DP("Removing shadow pointer " DPxMOD "\n", - DPxPTR((void **)Itr->first)); - Itr = Device.ShadowPtrMap.erase(Itr); - } else { - ++Itr; - } - return OFFLOAD_SUCCESS; - }; - applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR); - - // Add pointer to the buffer for later deallocation - if (DelEntry && !IsHostPtr) - DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier); + // Add pointer to the buffer for post-synchronize processing. + PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], + DelEntry && !IsHostPtr, TPR); } } @@ -843,18 +847,66 @@ return OFFLOAD_FAIL; // Deallocate target pointer - for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { - if (FromMapperBase && FromMapperBase == Info.HstPtrBegin) - continue; - Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, - Info.HasHoldModifier); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Deallocating data from device failed.\n"); - return OFFLOAD_FAIL; + for (PostProcessingInfo &Info : PostProcessingPtrs) { + // If we marked the entry to be deleted we need to verify no other thread + // reused it by now. If deletion is still supposed to happen by this thread + // LR will be set and exclusive access to the HDTT map will avoid another + // thread reusing the entry now. Note that we do not request (exclusive) + // access to the HDTT map if Info.DelEntry is not set. + LookupResult LR; + DeviceTy::HDTTMapAccessorTy HDTTMap = + Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); + + if (Info.DelEntry) { + LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); + if (LR.Entry->getTotalRefCount() != 0 || + LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) { + // The thread is not in charge of deletion anymore. Give up access to + // the HDTT map and unset the deletion flag. + HDTTMap.destroy(); + Info.DelEntry = false; + } + } + + // If we copied back to the host a struct/array containing pointers, we + // need to restore the original host pointer values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. + auto CB = [&](ShadowPtrListTy::iterator &Itr) { + // If we copied the struct to the host, we need to restore the pointer. + if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) { + void **ShadowHstPtrAddr = (void **)Itr->first; + *ShadowHstPtrAddr = Itr->second.HstPtrVal; + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", + DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); + } + // If the struct is to be deallocated, remove the shadow entry. + if (Info.DelEntry) { + DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first)); + Itr = Device.ShadowPtrMap.erase(Itr); + } else { + ++Itr; + } + return OFFLOAD_SUCCESS; + }; + applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, + Info.TPR); + + // If we are deleting the entry the DataMapMtx is locked and we own the + // entry. + if (Info.DelEntry) { + if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Deallocating data from device failed.\n"); + break; + } } } - return OFFLOAD_SUCCESS; + return Ret; } static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, diff --git a/openmp/libomptarget/test/mapping/map_back_race.cpp b/openmp/libomptarget/test/mapping/map_back_race.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/mapping/map_back_race.cpp @@ -0,0 +1,32 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +// Taken from https://github.com/llvm/llvm-project/issues/54216 + +#include +#include +#include + +bool almost_equal(float x, float gold, float rel_tol = 1e-09, + float abs_tol = 0.0) { + return std::abs(x - gold) <= + std::max(rel_tol * std::max(std::abs(x), std::abs(gold)), abs_tol); +} +void test_parallel_for__target() { + const int N0{32768}; + const float expected_value{N0}; + float counter_N0{}; +#pragma omp parallel for + for (int i0 = 0; i0 < N0; i0++) { +#pragma omp target map(tofrom : counter_N0) + { +#pragma omp atomic update + counter_N0 = counter_N0 + 1.; + } + } + if (!almost_equal(counter_N0, expected_value, 0.01)) { + std::cerr << "Expected: " << expected_value << " Got: " << counter_N0 + << std::endl; + std::exit(112); + } +} +int main() { test_parallel_for__target(); }