diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -410,9 +410,8 @@
   /// according to the assumptions that we've made during the analysis.
   /// The method might also version the pointer stride according to \p Strides,
   /// and add new predicates to \p PSE.
-  void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
-              unsigned ASId, const ValueToValueMap &Strides,
-              PredicatedScalarEvolution &PSE);
+  void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, bool WritePtr,
+              unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE);
 
   /// No run-time memory checking is necessary.
   bool empty() const { return Pointers.empty(); }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -47,6 +47,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
@@ -66,6 +67,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "loop-accesses"
 
@@ -189,12 +191,11 @@
 ///
 /// There is no conflict when the intervals are disjoint:
 /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
-                                    unsigned DepSetId, unsigned ASId,
-                                    const ValueToValueMap &Strides,
+void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
+                                    bool WritePtr, unsigned DepSetId,
+                                    unsigned ASId,
                                     PredicatedScalarEvolution &PSE) {
-  // Get the stride replaced scev.
-  const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+  const SCEV *Sc = PtrExpr;
   ScalarEvolution *SE = PSE.getSE();
 
   const SCEV *ScStart;
@@ -371,9 +372,11 @@
 
   unsigned TotalComparisons = 0;
 
-  DenseMap<Value *, unsigned> PositionMap;
-  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
-    PositionMap[Pointers[Index].PointerValue] = Index;
+  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
+  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
+    auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
+    Iter.first->second.push_back(Index);
+  }
 
   // We need to keep track of what pointers we've already seen so we
   // don't process them twice.
@@ -404,34 +407,35 @@
       auto PointerI = PositionMap.find(MI->getPointer());
       assert(PointerI != PositionMap.end() &&
             "pointer in equivalence class not found in PositionMap");
-      unsigned Pointer = PointerI->second;
-      bool Merged = false;
-      // Mark this pointer as seen.
-      Seen.insert(Pointer);
-
-      // Go through all the existing sets and see if we can find one
-      // which can include this pointer.
-      for (RuntimeCheckingPtrGroup &Group : Groups) {
-        // Don't perform more than a certain amount of comparisons.
-        // This should limit the cost of grouping the pointers to something
-        // reasonable. If we do end up hitting this threshold, the algorithm
-        // will create separate groups for all remaining pointers.
-        if (TotalComparisons > MemoryCheckMergeThreshold)
-          break;
-
-        TotalComparisons++;
-
-        if (Group.addPointer(Pointer, *this)) {
-          Merged = true;
-          break;
+      for (unsigned Pointer : PointerI->second) {
+        bool Merged = false;
+        // Mark this pointer as seen.
+        Seen.insert(Pointer);
+
+        // Go through all the existing sets and see if we can find one
+        // which can include this pointer.
+        for (RuntimeCheckingPtrGroup &Group : Groups) {
+          // Don't perform more than a certain amount of comparisons.
+          // This should limit the cost of grouping the pointers to something
+          // reasonable. If we do end up hitting this threshold, the algorithm
+          // will create separate groups for all remaining pointers.
+          if (TotalComparisons > MemoryCheckMergeThreshold)
+            break;
+
+          TotalComparisons++;
+
+          if (Group.addPointer(Pointer, *this)) {
+            Merged = true;
+            break;
+          }
         }
-      }
 
-      if (!Merged)
-        // We couldn't add this pointer to any existing set or the threshold
-        // for the number of comparisons has been reached. Create a new group
-        // to hold the current pointer.
-        Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+        if (!Merged)
+          // We couldn't add this pointer to any existing set or the threshold
+          // for the number of comparisons has been reached. Create a new group
+          // to hold the current pointer.
+          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+      }
     }
 
     // We've computed the grouped checks for this partition.
@@ -631,11 +635,8 @@
 /// Check whether a pointer can participate in a runtime bounds check.
 /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
 /// by adding run-time checks (overflow checks) if necessary.
-static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
-                                const ValueToValueMap &Strides, Value *Ptr,
-                                Loop *L, bool Assume) {
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
-
+static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
+                                const SCEV *PtrScev, Loop *L, bool Assume) {
   // The bounds for loop-invariant pointer is trivial.
   if (PSE.getSE()->isLoopInvariant(PtrScev, L))
     return true;
@@ -698,34 +699,120 @@
                                           bool Assume) {
   Value *Ptr = Access.getPointer();
 
-  if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
-    return false;
+  auto TranslatePointers = [&](Value *Ptr) -> SmallVector<const SCEV *> {
+    ScalarEvolution &SE = *PSE.getSE();
+    auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+    auto AssumeInBoundsFlags = [&]() {
+      if (!GEP->isInBounds())
+        return false;
+
+      // We'd like to propagate flags from the IR to the corresponding
+      // SCEV nodes, but to do that, we have to ensure that said flag is
+      // valid in the entire defined scope of the SCEV.
+      auto *GEPI = dyn_cast<Instruction>(GEP);
+      // TODO: non-instructions have global scope. We might be able to
+      // prove some global scope cases
+      return GEPI && programUndefinedIfPoison(GEPI);
+    };
+
+    if (GEP && GEP->getNumOperands() == 2) {
+      if (auto *SI = dyn_cast<SelectInst>(GEP->getOperand(0))) {
+        const SCEV *BaseA = SE.getSCEV(SI->getOperand(1));
+        const SCEV *BaseB = SE.getSCEV(SI->getOperand(2));
+        const SCEV *Offset = SE.getSCEV(GEP->getOperand(1));
+        if (SE.getTypeSizeInBits(Offset->getType()) <
+            SE.getTypeSizeInBits(BaseA->getType()))
+          Offset = SE.getSignExtendExpr(
+              Offset, SE.getEffectiveSCEVType(BaseA->getType()));
+
+        SCEV::NoWrapFlags OffsetWrap =
+            AssumeInBoundsFlags() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+
+        Type *IntIdxTy = SE.getEffectiveSCEVType(BaseA->getType());
+        auto *CurTy = GEP->getSourceElementType();
+        const SCEV *ElementSize = SE.getSizeOfExpr(IntIdxTy, CurTy);
+        // Getelementptr indices are signed.
+        Offset = SE.getTruncateOrSignExtend(Offset, IntIdxTy);
+
+        // Multiply the index by the element size to compute the element
+        // offset.
+        Offset = SE.getMulExpr(Offset, ElementSize, OffsetWrap);
+        auto *PtrA = SE.getAddExpr(BaseA, Offset, SCEV::FlagNUW);
+        auto *PtrB = SE.getAddExpr(BaseB, Offset, SCEV::FlagNUW);
+        return {PtrA, PtrB};
+      } else if (match(GEP->getOperand(1),
+                       m_ZExt(m_Select(m_Value(), m_Value(), m_Value())))) {
+        auto *ZExt = cast<ZExtInst>(GEP->getOperand(1));
+        auto *SI = cast<SelectInst>(ZExt->getOperand(0));
+        const SCEV *OffsetA = SE.getSCEV(SI->getOperand(1));
+        const SCEV *OffsetB = SE.getSCEV(SI->getOperand(2));
+
+        SCEV::NoWrapFlags OffsetWrap =
+            AssumeInBoundsFlags() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+
+        auto *Base = SE.getSCEV(GEP->getOperand(0));
+        Type *IntIdxTy = SE.getEffectiveSCEVType(Base->getType());
+        auto *CurTy = GEP->getSourceElementType();
+        const SCEV *ElementSize = SE.getSizeOfExpr(IntIdxTy, CurTy);
+        // Getelementptr indices are signed.
+        OffsetA = SE.getTruncateOrSignExtend(OffsetA, IntIdxTy);
+        OffsetB = SE.getTruncateOrSignExtend(OffsetB, IntIdxTy);
+
+        // Multiply the index by the element size to compute the element
+        // offset.
+        OffsetA = SE.getMulExpr(OffsetA, ElementSize, OffsetWrap);
+        OffsetB = SE.getMulExpr(OffsetB, ElementSize, OffsetWrap);
+        auto *PtrA = SE.getAddExpr(Base, OffsetA, SCEV::FlagNUW);
+        auto *PtrB = SE.getAddExpr(Base, OffsetB, SCEV::FlagNUW);
+        return {PtrA, PtrB};
+      }
+    }
+    return {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)};
+  };
 
-  // When we run after a failing dependency check we have to make sure
-  // we don't have wrapping pointers.
-  if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
-    auto *Expr = PSE.getSCEV(Ptr);
-    if (!Assume || !isa<SCEVAddRecExpr>(Expr))
+  SmallVector<const SCEV *> TranslatedPtrs = TranslatePointers(Ptr);
+
+  for (const SCEV *PtrExpr : TranslatedPtrs) {
+    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
       return false;
-    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
+
+    // When we run after a failing dependency check we have to make sure
+    // we don't have wrapping pointers.
+    if (ShouldCheckWrap) {
+      if (TranslatedPtrs.size() > 1) {
+        return false;
+      }
+      if (!isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
+        auto *Expr = PSE.getSCEV(Ptr);
+        if (!Assume || !isa<SCEVAddRecExpr>(Expr))
+          return false;
+        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
+      }
+    }
+    // If there's only one option for Ptr, look it up after bounds and wrap
+    // checking, because assumptions might have been added to PSE.
+    if (TranslatedPtrs.size() == 1)
+      TranslatedPtrs[0] = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
   }
 
-  // The id of the dependence set.
-  unsigned DepId;
+  for (const SCEV *PtrExpr : TranslatedPtrs) {
+    // The id of the dependence set.
+    unsigned DepId;
 
-  if (isDependencyCheckNeeded()) {
-    Value *Leader = DepCands.getLeaderValue(Access).getPointer();
-    unsigned &LeaderId = DepSetId[Leader];
-    if (!LeaderId)
-      LeaderId = RunningDepId++;
-    DepId = LeaderId;
-  } else
-    // Each access has its own dependence set.
-    DepId = RunningDepId++;
+    if (isDependencyCheckNeeded()) {
+      Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+      unsigned &LeaderId = DepSetId[Leader];
+      if (!LeaderId)
+        LeaderId = RunningDepId++;
+      DepId = LeaderId;
+    } else
+      // Each access has its own dependence set.
+      DepId = RunningDepId++;
 
-  bool IsWrite = Access.getInt();
-  RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
-  LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
+    bool IsWrite = Access.getInt();
+    RtCheck.insert(TheLoop, Ptr, PtrExpr, IsWrite, DepId, ASId, PSE);
+    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
+  }
 
   return true;
 }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
@@ -5,10 +5,37 @@
 
 ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset':
 ; CHECK-NEXT:   for.body:
-; CHECK-NEXT:     Report: cannot identify array bounds
-; CHECK-NEXT:     Dependences:
-; CHECK-NEXT:     Run-time memory checks:
-; CHECK-NEXT:     Grouped accesses:
+; CHECK-NEXT:     Memory dependences are safe with run-time checks
+; CHECK-NEXT:     Dependences:
+; CHECK-NEXT:     Run-time memory checks:
+; CHECK-NEXT:     Check 0:
+; CHECK-NEXT:       Comparing group
+; CHECK-NEXT:         %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv
+; CHECK-NEXT:       Against group
+; CHECK-NEXT:         %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv
+; CHECK-NEXT:     Check 1:
+; CHECK-NEXT:       Comparing group
+; CHECK-NEXT:         %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv
+; CHECK-NEXT:       Against group
+; CHECK-NEXT:         %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv
+; CHECK-NEXT:     Check 2:
+; CHECK-NEXT:       Comparing group
+; CHECK-NEXT:         %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv
+; CHECK-NEXT:       Against group
+; CHECK-NEXT:         %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv
+; CHECK-NEXT:     Grouped accesses:
+; CHECK-NEXT:       Group
+; CHECK-NEXT:         (Low: %Dest High: (400 + %Dest))
+; CHECK-NEXT:           Member: {%Dest,+,4}<%for.body>
+; CHECK-NEXT:       Group
+; CHECK-NEXT:         (Low: %Preds High: (400 + %Preds))
+; CHECK-NEXT:           Member: {%Preds,+,4}<%for.body>
+; CHECK-NEXT:       Group
+; CHECK-NEXT:         (Low: %Base2 High: (400 + %Base2))
+; CHECK-NEXT:           Member: {%Base2,+,4}<%for.body>
+; CHECK-NEXT:       Group
+; CHECK-NEXT:         (Low: %Base1 High: (400 + %Base1))
+; CHECK-NEXT:           Member: {%Base1,+,4}<%for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:     SCEV assumptions:
@@ -50,15 +77,33 @@
 
 ; CHECK-LABEL: function 'forked_ptrs_same_base_different_offset':
 ; CHECK-NEXT:   for.body:
-; CHECK-NEXT:     Report: cannot identify array bounds
-; CHECK-NEXT:     Dependences:
-; CHECK-NEXT:     Run-time memory checks:
-; CHECK-NEXT:     Grouped accesses:
-; CHECK-EMPTY:
-; CHECK-NEXT:     Non vectorizable stores to invariant address were not found in loop.
-; CHECK-NEXT:     SCEV assumptions:
+; CHECK-NEXT:     Memory dependences are safe with run-time checks
+; CHECK-NEXT:     Dependences:
+; CHECK-NEXT:     Run-time memory checks:
+; CHECK-NEXT:     Check 0:
+; CHECK-NEXT:       Comparing group ([[GRP1:.+]]):
+; CHECK-NEXT:         %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv
+; CHECK-NEXT:       Against group ([[GRP2:.+]]):
+; CHECK-NEXT:         %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv
+; CHECK-NEXT:     Check 1:
+; CHECK-NEXT:       Comparing group ([[GRP1]]):
+; CHECK-NEXT:         %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv
+; CHECK-NEXT:       Against group ([[GRP3:.+]]):
+; CHECK-NEXT:         %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213
+; CHECK-NEXT:         %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213
+; CHECK-NEXT:     Grouped accesses:
+; CHECK-NEXT:       Group [[GRP1]]:
+; CHECK-NEXT:         (Low: %Dest High: (400 + %Dest))
+; CHECK-NEXT:           Member: {%Dest,+,4}<%for.body>
+; CHECK-NEXT:       Group [[GRP2]]:
+; CHECK-NEXT:         (Low: %Preds High: (400 + %Preds))
+; CHECK-NEXT:           Member: {%Preds,+,4}<%for.body>
+; CHECK-NEXT:       Group [[GRP3]]:
+; CHECK-NEXT:         (Low: %Base High: (404 + %Base))
+; CHECK-NEXT:           Member: {(4 + %Base),+,4}<%for.body>
+; CHECK-NEXT:           Member: {%Base,+,4}<%for.body>
 ; CHECK-EMPTY:
-; CHECK-NEXT:     Expressions re-written:
+; CHECK-NEXT:     Non vectorizable stores to invariant address were not found in loop.
 
 ;;;; Derived from the following C code
 ;; void forked_ptrs_same_base_different_offset(float *A, float *B, int *C) {
diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
--- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
@@ -17,22 +17,84 @@
 define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) {
 ; CHECK-LABEL: @forked_ptrs_different_base_same_offset(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE2:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP10:%.*]] = getelementptr float, float* [[BASE1:%.*]], i64 100
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]]
+; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[BOUND015:%.*]] = icmp ugt float* [[SCEVGEP10]], [[DEST]]
+; CHECK-NEXT:    [[BOUND116:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
+; CHECK-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x float*> poison, float* [[BASE1]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT19]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT20]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float*> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float*> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float*> [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float*> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP10]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP12]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP14]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP16]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP24]], <4 x float>* [[TMP26]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP28]], 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2]], float* [[BASE1]]
 ; CHECK-NEXT:    [[DOTSINK_IN:%.*]] = getelementptr inbounds float, float* [[SPEC_SELECT]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[DOTSINK]], float* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[DOTSINK]], float* [[TMP29]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -70,26 +132,83 @@
 define dso_local void @forked_ptrs_same_base_different_offset(float* nocapture readonly %Base, float* nocapture %Dest, i32* nocapture readonly %Preds) {
 ; CHECK-LABEL: @forked_ptrs_same_base_different_offset(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE:%.*]], i64 101
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]]
+; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE]]
+; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND13:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND15:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[VEC_IND13]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_IND15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP9]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP11]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP13]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP15]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !alias.scope !17, !noalias !19
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT14]] = add <4 x i32> [[VEC_IND13]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL12:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP27]], 0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I_014]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP28]]
 ; CHECK-NEXT:    [[IDXPROM213:%.*]] = zext i32 [[OFFSET_0]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE:%.*]], i64 [[IDXPROM213]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[IDXPROM213]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP29]], float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 entry:
   br label %for.body
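
For reference, a minimal sketch (not part of the patch) of the forked-pointer shape this change targets, written in the same style as the tests above. The names (@forked_base, %A, %B, %dst, %c) are illustrative. The %src address is a getelementptr whose base is a select between two loop-invariant pointers; with this patch, LAA can translate it into the two SCEVs {%A,+,4}<%loop> and {%B,+,4}<%loop> and emit a runtime check for each against the %dst access instead of reporting "cannot identify array bounds":

define void @forked_base(float* %A, float* %B, float* %dst, i32* %c) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %c.addr = getelementptr inbounds i32, i32* %c, i64 %iv
  %pred = load i32, i32* %c.addr, align 4
  %cmp = icmp eq i32 %pred, 0
  ; The forked pointer: a GEP whose base is a select between two bases.
  %base = select i1 %cmp, float* %A, float* %B
  %src = getelementptr inbounds float, float* %base, i64 %iv
  %val = load float, float* %src, align 4
  %dst.addr = getelementptr inbounds float, float* %dst, i64 %iv
  store float %val, float* %dst.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 100
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}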