diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -61,8 +61,7 @@ struct StatesTy { StatesTy(uint64_t DRC, uint64_t HRC) : DynRefCount(DRC), HoldRefCount(HRC), - MayContainAttachedPointers(false), DeleteThreadId(std::thread::id()) { - } + MayContainAttachedPointers(false) {} /// The dynamic reference count is the standard reference count as of OpenMP /// 4.5. The hold reference count is an OpenMP extension for the sake of /// OpenACC support. @@ -101,13 +100,10 @@ /// should be written as void *Event[2]. void *Event = nullptr; - /// The id of the thread responsible for deleting this entry. This thread - /// set the reference count to zero *last*. Other threads might reuse the - /// entry while it is marked for deletion but not yet deleted (e.g., the - /// data is still being moved back). If another thread reuses the entry we - /// will have a non-zero reference count *or* the thread will have changed - /// this id, effectively taking over deletion responsibility. - std::thread::id DeleteThreadId; + /// Number of threads currently holding a reference to the entry at a + /// targetDataEnd. This is used to ensure that only the last thread that + /// references this entry will actually delete it. + int32_t DataEndThreadCount = 0; }; // When HostDataToTargetTy is used by std::set, std::set::iterator is const // use unique_ptr to make States mutable. @@ -148,13 +144,17 @@ /// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise. int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const; - /// Indicate that the current thread expected to delete this entry. - void setDeleteThreadId() const { - States->DeleteThreadId = std::this_thread::get_id(); + /// Functions that manage the number of threads referencing the entry in a + /// targetDataEnd. 
+ void incDataEndThreadCount() { ++States->DataEndThreadCount; } + + [[nodiscard]] int32_t decDataEndThreadCount() { + return --States->DataEndThreadCount; } - /// Return the thread id of the thread expected to delete this entry. - std::thread::id getDeleteThreadId() const { return States->DeleteThreadId; } + [[nodiscard]] int32_t getDataEndThreadCount() const { + return States->DataEndThreadCount; + } /// Set the event bound to this data map. void setEvent(void *Event) const { States->Event = Event; } @@ -377,20 +377,37 @@ void *getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, int64_t Size); - TargetPointerResultTy getTgtPtrBegin(void *HstPtrBegin, int64_t Size, - bool &IsLast, bool UpdateRefCount, - bool UseHoldRefCount, bool &IsHostPtr, - bool MustContain = false, - bool ForceDelete = false); - - /// Deallocate \p LR and remove the entry. Assume the total reference count is - /// zero and the calling thread is the deleting thread for \p LR. \p HDTTMap - /// ensure the caller holds exclusive access and can modify the map. Return \c - /// OFFLOAD_SUCCESS if the map entry existed, and return \c OFFLOAD_FAIL if - /// not. It is the caller's responsibility to skip calling this function if - /// the map entry is not expected to exist because \p HstPtrBegin uses shared - /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + /// Return the target pointer begin (where the data will be moved). + /// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target. + /// - \p UpdateRefCount and \p UseHoldRefCount control whether and which entry + /// reference counters will be decremented. + /// - \p MustContain enforces that the query must not extend beyond an already + /// mapped entry to be valid. + /// - \p ForceDelete deletes the entry regardless of its reference counting + /// (unless it is infinite). 
+ /// - \p FromDataEnd tracks the number of threads referencing the entry at + /// targetDataEnd for delayed deletion purposes. + [[nodiscard]] TargetPointerResultTy + getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount, bool UseHoldRefCount, bool &IsHostPtr, + bool MustContain = false, bool ForceDelete = false, + bool FromDataEnd = false); + + /// Remove the \p Entry from the data map. Expect the entry's total reference + /// count to be zero and the caller thread to be the last one using it. \p + /// HDTTMap ensures the caller holds exclusive access and can modify the map. + /// Return \c OFFLOAD_SUCCESS if the map entry existed, and return \c + /// OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling + /// this function if the map entry is not expected to exist because \p + /// HstPtrBegin uses shared memory. + [[nodiscard]] int eraseMapEntry(HDTTMapAccessorTy &HDTTMap, + HostDataToTargetTy *Entry, int64_t Size); + + /// Deallocate the \p Entry from the device memory and delete it. Return \c + /// OFFLOAD_SUCCESS if the deallocation operations executed successfully, and + /// return \c OFFLOAD_FAIL otherwise. + [[nodiscard]] int deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, + int64_t Size); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -359,13 +359,11 @@ return {{IsNew, IsHostPtr, IsPresent}, Entry, TargetPointer}; } -// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target. -// Return the target pointer begin (where the data will be moved). -// Decrement the reference counter if called from targetDataEnd. 
TargetPointerResultTy DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount, bool UseHoldRefCount, - bool &IsHostPtr, bool MustContain, bool ForceDelete) { + bool &IsHostPtr, bool MustContain, bool ForceDelete, + bool FromDataEnd) { HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); void *TargetPointer = NULL; @@ -386,15 +384,18 @@ "expected correct IsLast prediction for reset"); } + // Increment the number of threads that are using the entry on a + // targetDataEnd, tracking the number of possible "deleters". A thread may + // come to own the entry deletion even if it was not the last one querying + // for it. Thus, we must track every query on targetDataEnds to ensure only + // the last thread that holds a reference to an entry actually deletes it. + if (FromDataEnd) + HT.incDataEndThreadCount(); + const char *RefCountAction; if (!UpdateRefCount) { RefCountAction = " (update suppressed)"; } else if (IsLast) { - // Mark the entry as to be deleted by this thread. Another thread might - // reuse the entry and take "ownership" for the deletion while this thread - // is waiting for data transfers. That is fine and the current thread will - // simply skip the deletion step then. - HT.setDeleteThreadId(); HT.decRefCount(UseHoldRefCount); assert(HT.getTotalRefCount() == 0 && "Expected zero reference count when deletion is scheduled"); @@ -450,41 +451,42 @@ return NULL; } -int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { - // Check if the pointer is contained in any sub-nodes. - if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || - LR.Flags.ExtendsAfter)) { - REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the" - " allocated memory\n", - DPxPTR(LR.Entry->HstPtrBegin)); - return OFFLOAD_FAIL; - } - - auto &HT = *LR.Entry; - // Verify this thread is still in charge of deleting the entry. 
- assert(HT.getTotalRefCount() == 0 && - HT.getDeleteThreadId() == std::this_thread::get_id() && +int DeviceTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap, + HostDataToTargetTy *Entry, int64_t Size) { + assert(Entry && "Trying to delete a null entry from the HDTT map."); + assert(Entry->getTotalRefCount() == 0 && Entry->getDataEndThreadCount() == 0 && "Trying to delete entry that is in use or owned by another thread."); - DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", - DPxPTR(HT.TgtPtrBegin), Size); - deleteData((void *)HT.TgtPtrBegin); INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID, "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%" PRId64 ", Name=%s\n", - DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size, - (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() : "unknown"); - void *Event = LR.Entry->getEvent(); - HDTTMap->erase(LR.Entry); - delete LR.Entry; + DPxPTR(Entry->HstPtrBegin), DPxPTR(Entry->TgtPtrBegin), Size, + (Entry->HstPtrName) ? 
getNameFromMapping(Entry->HstPtrName).c_str() + : "unknown"); + + if (HDTTMap->erase(Entry) == 0) { + REPORT("Trying to remove a non-existent map entry\n"); + return OFFLOAD_FAIL; + } - int Ret = OFFLOAD_SUCCESS; + return OFFLOAD_SUCCESS; +} + +int DeviceTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, int64_t Size) { + assert(Entry && "Trying to deallocate a null entry."); + + DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", + DPxPTR(Entry->TgtPtrBegin), Size); + + void *Event = Entry->getEvent(); if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); - Ret = OFFLOAD_FAIL; + return OFFLOAD_FAIL; } + int Ret = deleteData((void *)Entry->TgtPtrBegin); + delete Entry; + return Ret; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -751,17 +751,16 @@ /// The mapping type (bitfield). int64_t ArgType; + /// Index of the argument in the data mapping scheme. + int32_t ArgIndex; + /// The target pointer information. TargetPointerResultTy TPR; - /// Are we expecting to delete this entry or not. Even if set, we might not - /// delete the entry if another thread reused the entry in the meantime. - bool DelEntry; - - PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry, - TargetPointerResultTy TPR) - : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR), - DelEntry(DelEntry) {} + PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, + int32_t ArgIndex, TargetPointerResultTy TPR) + : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), + ArgIndex(ArgIndex), TPR(TPR) {} }; /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to @@ -811,42 +810,55 @@ /// data end. This includes the update of pointers at the host and removal of /// device buffer when needed. 
It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS /// according to the successfulness of the operations. -static int +[[nodiscard]] static int postProcessingTargetDataEnd(DeviceTy *Device, SmallVector EntriesInfo, - void *FromMapperBase) { + bool FromMapper) { int Ret = OFFLOAD_SUCCESS; + void *FromMapperBase = nullptr; + + for (auto &[HstPtrBegin, DataSize, ArgType, ArgIndex, TPR] : EntriesInfo) { + bool DelEntry = !TPR.isHostPointer(); + + // If the last element from the mapper (for end transfer args comes in + // reverse order), do not remove the partial entry, the parent struct still + // exists. + if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + DelEntry = false; // protect parent struct from being deallocated + } + + if (DelEntry && FromMapper && ArgIndex == 0) { + DelEntry = false; + FromMapperBase = HstPtrBegin; + } - for (PostProcessingInfo &Info : EntriesInfo) { // If we marked the entry to be deleted we need to verify no other // thread reused it by now. If deletion is still supposed to happen by // this thread LR will be set and exclusive access to the HDTT map // will avoid another thread reusing the entry now. Note that we do - // not request (exclusive) access to the HDTT map if Info.DelEntry is + // not request (exclusive) access to the HDTT map if DelEntry is // not set. - LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = - Device->HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - - if (Info.DelEntry) { - LR = Device->lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); - if (LR.Entry->getTotalRefCount() != 0 || - LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) { - // The thread is not in charge of deletion anymore. Give up access - // to the HDTT map and unset the deletion flag. 
- HDTTMap.destroy(); - Info.DelEntry = false; - } + Device->HostDataToTargetMap.getExclusiveAccessor(!DelEntry); + + const bool IsNotLastUser = TPR.Entry->decDataEndThreadCount() != 0; + if (DelEntry && (TPR.Entry->getTotalRefCount() != 0 || IsNotLastUser)) { + // The thread is not in charge of deletion anymore. Give up access + // to the HDTT map and unset the deletion flag. + HDTTMap.destroy(); + DelEntry = false; } // If we copied back to the host a struct/array containing pointers, // we need to restore the original host pointer values from their // shadow copies. If the struct is going to be deallocated, remove any // remaining shadow pointer entries for this struct. + const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM; auto CB = [&](ShadowPtrListTy::iterator &Itr) { // If we copied the struct to the host, we need to restore the // pointer. - if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) { + if (HasFrom) { void **ShadowHstPtrAddr = (void **)Itr->first; *ShadowHstPtrAddr = Itr->second.HstPtrVal; DP("Restoring original host pointer value " DPxMOD " for host " @@ -854,7 +866,7 @@ DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); } // If the struct is to be deallocated, remove the shadow entry. - if (Info.DelEntry) { + if (DelEntry) { DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first)); auto OldItr = Itr; Itr++; @@ -864,19 +876,20 @@ } return OFFLOAD_SUCCESS; }; - applyToShadowMapEntries(*Device, CB, Info.HstPtrBegin, Info.DataSize, - Info.TPR); + applyToShadowMapEntries(*Device, CB, HstPtrBegin, DataSize, TPR); + + if (!DelEntry || (FromMapperBase && FromMapperBase == HstPtrBegin)) + continue; // If we are deleting the entry the DataMapMtx is locked and we own // the entry. 
- if (Info.DelEntry) { - if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device->deallocTgtPtr(HDTTMap, LR, Info.DataSize); - - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Deallocating data from device failed.\n"); - break; - } + Ret = Device->eraseMapEntry(HDTTMap, TPR.Entry, DataSize); + // Entry is already removed from the map, we can unlock it now. + HDTTMap.destroy(); + Ret |= Device->deallocTgtPtrAndEntry(TPR.Entry, DataSize); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Deallocating data from device failed.\n"); + break; } } @@ -950,7 +963,7 @@ // If PTR_AND_OBJ, HstPtrBegin is address of pointee TargetPointerResultTy TPR = Device.getTgtPtrBegin( HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr, - !IsImplicit, ForceDelete); + !IsImplicit, ForceDelete, /*FromDataEnd=*/true); void *TgtPtrBegin = TPR.TargetPointer; if (!TPR.isPresent() && !TPR.isHostPointer() && (DataSize || HasPresentModifier)) { @@ -991,61 +1004,42 @@ if (!TPR.isPresent()) continue; - bool DelEntry = IsLast; - - // If the last element from the mapper (for end transfer args comes in - // reverse order), do not remove the partial entry, the parent struct still - // exists. - if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - DelEntry = false; // protect parent struct from being deallocated - } - - if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { - // Move data back to the host - if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { - bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; - if ((Always || IsLast) && !IsHostPtr) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - - std::lock_guard LG(*TPR.Entry); - // Wait for any previous transfer if an event is present. 
- if (void *Event = TPR.Entry->getEvent()) { - if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { - REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event)); - return OFFLOAD_FAIL; - } - } - - Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, - AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Copying data from device failed.\n"); - return OFFLOAD_FAIL; - } - - // As we are expecting to delete the entry the d2h copy might race - // with another one that also tries to delete the entry. This happens - // as the entry can be reused and the reuse might happen after the - // copy-back was issued but before it completed. Since the reuse might - // also copy-back a value we would race. - if (IsLast) { - if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) != - OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + // Move data back to the host + const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; + const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM; + if (HasFrom && (HasAlways || IsLast) && !IsHostPtr) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + + std::lock_guard LG(*TPR.Entry); + // Wait for any previous transfer if an event is present. + if (void *Event = TPR.Entry->getEvent()) { + if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { + REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event)); + return OFFLOAD_FAIL; } } - if (DelEntry && FromMapper && I == 0) { - DelEntry = false; - FromMapperBase = HstPtrBegin; + + Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data from device failed.\n"); + return OFFLOAD_FAIL; } - // Add pointer to the buffer for post-synchronize processing. 
- PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], - DelEntry && !IsHostPtr, TPR); + // As we are expecting to delete the entry the d2h copy might race + // with another one that also tries to delete the entry. This happens + // as the entry can be reused and the reuse might happen after the + // copy-back was issued but before it completed. Since the reuse might + // also copy-back a value we would race. + if (IsLast) { + if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + } } + + // Add pointer to the buffer for post-synchronize processing. + PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], I, TPR); } // Add post-processing functions diff --git a/openmp/libomptarget/test/mapping/map_back_race.cpp b/openmp/libomptarget/test/mapping/map_back_race.cpp --- a/openmp/libomptarget/test/mapping/map_back_race.cpp +++ b/openmp/libomptarget/test/mapping/map_back_race.cpp @@ -2,19 +2,6 @@ // Taken from https://github.com/llvm/llvm-project/issues/54216 -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-LTO -// UNSUPPORTED: powerpc64le-ibm-linux-gnu -// UNSUPPORTED: powerpc64le-ibm-linux-gnu-LTO -// UNSUPPORTED: powerpc64-ibm-linux-gnu -// UNSUPPORTED: powerpc64-ibm-linux-gnu-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO - #include #include #include