diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -393,7 +393,10 @@ // equivalence class, the iteration order is deterministic. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end(); MI != ME; ++MI) { - unsigned Pointer = PositionMap[MI->getPointer()]; + auto PointerI = PositionMap.find(MI->getPointer()); + assert(PointerI != PositionMap.end() && + "pointer in equivalence class not found in PositionMap"); + unsigned Pointer = PointerI->second; bool Merged = false; // Mark this pointer as seen. Seen.insert(Pointer); @@ -726,52 +729,55 @@ SmallVector Retries; + // First, count how many write and read accesses are in the alias set. Also + // collect MemAccessInfos for later. + SmallVector AccessInfos; for (auto A : AS) { Value *Ptr = A.getValue(); bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); if (IsWrite) ++NumWritePtrChecks; else ++NumReadPtrChecks; + AccessInfos.emplace_back(Ptr, IsWrite); + } + // We do not need runtime checks for this alias set, if there are no writes + // or a single write and no reads. + if (NumWritePtrChecks == 0 || + (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { + assert((AS.size() <= 1 || + all_of(AS, + [this](auto AC) { + MemAccessInfo AccessWrite(AC.getValue(), true); + return DepCands.findValue(AccessWrite) == DepCands.end(); + })) && + "Can only skip updating CanDoRT below, if all entries in AS " + "are reads or there is at most 1 entry"); + continue; + } + + for (auto &Access : AccessInfos) { if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop, RunningDepId, ASId, ShouldCheckWrap, false)) { - LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n'); + LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" + << *Access.getPointer() << '\n'); Retries.push_back(Access); CanDoAliasSetRT = false; } } - // If we have at least two writes or one write and a read then we need to - // check them. But there is no need to checks if there is only one - // dependence set for this alias set. - // // Note that this function computes CanDoRT and MayNeedRTCheck // independently. For example CanDoRT=false, MayNeedRTCheck=false means that // we have a pointer for which we couldn't find the bounds but we don't // actually need to emit any checks so it does not matter. - bool NeedsAliasSetRTCheck = false; - if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2)) { - NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 || - (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1)); - // For alias sets without at least 2 writes or 1 write and 1 read, there - // is no need to generate RT checks and CanDoAliasSetRT for this alias set - // does not impact whether runtime checks can be generated. - if (!NeedsAliasSetRTCheck) { - assert((AS.size() <= 1 || - all_of(AS, - [this](auto AC) { - MemAccessInfo AccessWrite(AC.getValue(), true); - return DepCands.findValue(AccessWrite) == - DepCands.end(); - })) && - "Can only skip updating CanDoRT below, if all entries in AS " - "are reads or there is at most 1 entry"); - continue; - } - } + // + // We need runtime checks for this alias set, if there are at least 2 + // dependence sets (in which case RunningDepId > 2) or if we need to re-try + // any bound checks (because in that case the number of dependence sets is + // incomplete). + bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); // We need to perform run-time alias checks, but some pointers had bounds // that couldn't be checked. diff --git a/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll b/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: amdgpu-registered-target + +; RUN: opt -globals-aa -loop-load-elim -S %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%struct.foo = type { %struct.pluto, i8, i8*, i32 } +%struct.pluto = type { i32, i32, i32, %struct.wombat*, i32, i32, i32 } +%struct.wombat = type { %struct.barney } +%struct.barney = type { <2 x float> } + +@global = external protected local_unnamed_addr addrspace(4) externally_initialized global %struct.foo, align 8 +@global.1 = internal unnamed_addr addrspace(3) constant [4000 x float] undef, align 16 + +; Function Attrs: nounwind +define protected amdgpu_kernel void @widget(i32 %arg, i32 %arg1) #0 { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.wombat*, %struct.wombat* addrspace(4)* getelementptr inbounds (%struct.foo, [[STRUCT_FOO:%.*]] addrspace(4)* @global, i64 0, i32 0, i32 3), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_WOMBAT:%.*]], %struct.wombat* [[TMP2]], i64 undef, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.barney* [[TMP3]] to i64* +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb5.loopexit: +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ undef, [[BB5]] ], [ [[TMP19:%.*]], [[BB6]] ] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw i32 [[TMP7]], undef +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], undef +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_WOMBAT]], %struct.wombat* [[TMP2]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast %struct.wombat* [[TMP11]] to i64* +; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = srem i32 1, [[ARG1:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float addrspace(3)* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float addrspace(3)* [[TMP]], align 4 +; CHECK-NEXT: store i64 [[TMP13]], i64* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP19]] = add nsw i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp slt i32 [[TMP7]], 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[BB6]], label [[BB5_LOOPEXIT:%.*]] +; +bb: + %tmp = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 %arg + %tmp2 = load %struct.wombat*, %struct.wombat* addrspace(4)* getelementptr inbounds (%struct.foo, %struct.foo addrspace(4)* @global, i64 0, i32 0, i32 3), align 8 + %tmp3 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp2, i64 undef, i32 0 + %tmp4 = bitcast %struct.barney* %tmp3 to i64* + br label %bb5 + +bb5: ; preds = %bb6, %bb + br label %bb6 + +bb6: ; preds = %bb6, %bb5 + %tmp7 = phi i32 [ undef, %bb5 ], [ %tmp19, %bb6 ] + %tmp8 = mul nsw i32 %tmp7, undef + %tmp9 = add i32 %tmp8, undef + %tmp10 = sext i32 %tmp9 to i64 + %tmp11 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp2, i64 %tmp10 + %tmp12 = bitcast %struct.wombat* %tmp11 to i64* + %tmp13 = load i64, i64* %tmp12, align 8 + %tmp14 = srem i32 1, %arg1 + %tmp15 = add nuw nsw i32 %tmp14, 1 + %tmp16 = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 %tmp15 + %tmp17 = load float, float addrspace(3)* %tmp16, align 4 + %tmp18 = load float, float addrspace(3)* %tmp, align 4 + store i64 %tmp13, i64* %tmp4, align 8 + %tmp19 = add nsw i32 %tmp7, 1 + %tmp20 = icmp slt i32 %tmp7, 3 + br i1 %tmp20, label %bb6, label %bb5 +} + +attributes #0 = { nounwind }