diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -406,9 +406,9 @@
   /// according to the assumptions that we've made during the analysis.
   /// The method might also version the pointer stride according to \p Strides,
   /// and add new predicates to \p PSE.
-  void insert(Loop *Lp, Value *Ptr, Type *AccessTy, bool WritePtr,
-              unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides,
-              PredicatedScalarEvolution &PSE);
+  void insert(Loop *Lp, Value *Ptr, Type *AccessTy, const SCEV *PtrExpr,
+              bool WritePtr, unsigned DepSetId, unsigned ASId,
+              const ValueToValueMap &Strides, PredicatedScalarEvolution &PSE);
 
   /// No run-time memory checking is necessary.
   bool empty() const { return Pointers.empty(); }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -128,6 +128,11 @@
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
 
+static cl::opt<unsigned> MaxForkedSCEVDepth(
+    "max-forked-scev-depth", cl::Hidden,
+    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
+    cl::init(5));
+
 bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
@@ -189,12 +194,12 @@
 /// There is no conflict when the intervals are disjoint:
 /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
 void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, Type *AccessTy,
-                                    bool WritePtr, unsigned DepSetId,
-                                    unsigned ASId,
+                                    const SCEV *PtrExpr, bool WritePtr,
+                                    unsigned DepSetId, unsigned ASId,
                                     const ValueToValueMap &Strides,
                                     PredicatedScalarEvolution &PSE) {
   // Get the stride replaced scev.
-  const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+  const SCEV *Sc = PtrExpr;
   ScalarEvolution *SE = PSE.getSE();
 
   const SCEV *ScStart;
@@ -370,9 +375,12 @@
   unsigned TotalComparisons = 0;
 
-  DenseMap<Value *, unsigned> PositionMap;
-  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
-    PositionMap[Pointers[Index].PointerValue] = Index;
+  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
+  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
+    auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
+    Iter.first->second.push_back(Index);
+  }
 
   // We need to keep track of what pointers we've already seen so we
   // don't process them twice.
@@ -403,34 +411,35 @@
       auto PointerI = PositionMap.find(MI->getPointer());
       assert(PointerI != PositionMap.end() &&
              "pointer in equivalence class not found in PositionMap");
-      unsigned Pointer = PointerI->second;
-      bool Merged = false;
-      // Mark this pointer as seen.
-      Seen.insert(Pointer);
-
-      // Go through all the existing sets and see if we can find one
-      // which can include this pointer.
-      for (RuntimeCheckingPtrGroup &Group : Groups) {
-        // Don't perform more than a certain amount of comparisons.
-        // This should limit the cost of grouping the pointers to something
-        // reasonable. If we do end up hitting this threshold, the algorithm
-        // will create separate groups for all remaining pointers.
-        if (TotalComparisons > MemoryCheckMergeThreshold)
-          break;
-
-        TotalComparisons++;
-
-        if (Group.addPointer(Pointer, *this)) {
-          Merged = true;
-          break;
+      for (unsigned Pointer : PointerI->second) {
+        bool Merged = false;
+        // Mark this pointer as seen.
+        Seen.insert(Pointer);
+
+        // Go through all the existing sets and see if we can find one
+        // which can include this pointer.
+        for (RuntimeCheckingPtrGroup &Group : Groups) {
+          // Don't perform more than a certain amount of comparisons.
+          // This should limit the cost of grouping the pointers to something
+          // reasonable. If we do end up hitting this threshold, the algorithm
+          // will create separate groups for all remaining pointers.
+          if (TotalComparisons > MemoryCheckMergeThreshold)
+            break;
+
+          TotalComparisons++;
+
+          if (Group.addPointer(Pointer, *this)) {
+            Merged = true;
+            break;
+          }
         }
-      }
 
-      if (!Merged)
-        // We couldn't add this pointer to any existing set or the threshold
-        // for the number of comparisons has been reached. Create a new group
-        // to hold the current pointer.
-        Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+        if (!Merged)
+          // We couldn't add this pointer to any existing set or the threshold
+          // for the number of comparisons has been reached. Create a new group
+          // to hold the current pointer.
+          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+      }
     }
 
     // We've computed the grouped checks for this partition.
@@ -629,10 +638,9 @@
 /// Check whether a pointer can participate in a runtime bounds check.
 /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
 /// by adding run-time checks (overflow checks) if necessary.
-static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
-                                const ValueToValueMap &Strides, Value *Ptr,
-                                Loop *L, bool Assume) {
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
+                                const SCEV *PtrScev, Loop *L, bool Assume) {
 
   // The bounds for loop-invariant pointer is trivial.
   if (PSE.getSE()->isLoopInvariant(PtrScev, L))
@@ -687,6 +695,176 @@
 }
 
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+//  %offset = select i1 %cmp, i64 %a, i64 %b
+//  %addr = getelementptr double, double* %base, i64 %offset
+//  %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may potentially yield a valid
+// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
+// ourselves before adding them to the list.
+static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+                            SmallVectorImpl<const SCEV *> &ScevList,
+                            unsigned Depth) {
+  // If our Value is loop invariant or a SCEVAddRecExpr, we already have
+  // a usable value. If it's not an instruction or we've exceeded our limit
+  // on recursion, just return whatever we have regardless of whether it can
+  // be used for a forked pointer or not.
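+  // (For instance, a loop-invariant base pointer, a constant offset, or an
+  // AddRec like {%base,+,8}<%loop> can be handed straight back to the
+  // caller.)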
+  const SCEV *Scev = SE->getSCEV(Ptr);
+  if (SE->isLoopInvariant(Scev, L) || isa<SCEVAddRecExpr>(Scev) ||
+      !isa<Instruction>(Ptr) || Depth == 0) {
+    ScevList.push_back(Scev);
+    return;
+  }
+
+  Depth--;
+
+  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *LHS, const SCEV *RHS) {
+    switch (Opcode) {
+    case Instruction::Add:
+      return SE->getAddExpr(LHS, RHS);
+    case Instruction::Sub:
+      return SE->getMinusSCEV(LHS, RHS);
+    case Instruction::Mul:
+      return SE->getMulExpr(LHS, RHS);
+    default:
+      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
+    }
+  };
+
+  Instruction *I = cast<Instruction>(Ptr);
+  unsigned Opcode = I->getOpcode();
+  switch (Opcode) {
+  case Instruction::BitCast:
+    findForkedSCEVs(SE, L, I->getOperand(0), ScevList, Depth);
+    break;
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    SmallVector<const SCEV *, 2> ExtScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), ExtScevs, Depth);
+    for (const SCEV *Scev : ExtScevs)
+      if (Opcode == Instruction::SExt)
+        ScevList.push_back(SE->getSignExtendExpr(Scev, I->getType()));
+      else
+        ScevList.push_back(SE->getZeroExtendExpr(Scev, I->getType()));
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    Type *SourceTy = GEP->getSourceElementType();
+    // We only handle base + single offset GEPs here for now.
+    // Not dealing with preexisting gathers yet, so no vectors.
+    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+      ScevList.push_back(Scev);
+      break;
+    }
+    SmallVector<const SCEV *, 2> BaseScevs;
+    SmallVector<const SCEV *, 2> OffsetScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
+
+    // Make sure we get the correct pointer type to extend to, including the
+    // address space.
+    const SCEV *BaseExpr = SE->getSCEV(GEP->getPointerOperand());
+    Type *IntPtrTy = SE->getEffectiveSCEVType(BaseExpr->getType());
+    SCEV::NoWrapFlags Wrap =
+        GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+    // Find the size of the type being pointed to. We only have a single
+    // index term (guarded above) so we don't need to index into arrays or
+    // structures, just get the size of the scalar value.
+    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) {
+      const SCEV *Off1 = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Off2 = SE->getTruncateOrSignExtend(OffsetScevs[1], IntPtrTy);
+      const SCEV *Mul1 = SE->getMulExpr(Size, Off1, Wrap);
+      const SCEV *Mul2 = SE->getMulExpr(Size, Off2, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul1, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[0], Mul2, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) {
+      const SCEV *Off = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Mul = SE->getMulExpr(Size, Off, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[1], Mul, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  case Instruction::Select: {
+    SmallVector<const SCEV *, 2> ChildScevs;
+    // A select means we've found a forked pointer, but we currently only
+    // support a single select per pointer, so if there's another behind this
+    // one then we just bail out and return the generic SCEV.
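+    // For example, an offset built from two selects that both vary inside
+    // the loop (hypothetical IR, not one of the patterns tested below):
+    //   %off.inner = select i1 %c1, i64 %iv.a, i64 %iv.b
+    //   %off = select i1 %c2, i64 %off.inner, i64 %iv.c
+    // produces more than two SCEVs for %off, so we fall back to the plain
+    // SCEV for the whole pointer.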
+    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
+    if (ChildScevs.size() == 2) {
+      ScevList.push_back(ChildScevs[0]);
+      ScevList.push_back(ChildScevs[1]);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  // If adding another binop to this list, update GetBinOpExpr above.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul: {
+    SmallVector<const SCEV *, 2> LScevs;
+    SmallVector<const SCEV *, 2> RScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
+    if (LScevs.size() == 2 && RScevs.size() == 1) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[1], RScevs[0]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else if (LScevs.size() == 1 && RScevs.size() == 2) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[0], RScevs[1]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  default:
+    // Just return the current SCEV if we haven't handled the instruction yet.
+    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+    ScevList.push_back(Scev);
+    break;
+  }
+
+  return;
+}
+
+static SmallVector<const SCEV *>
+findForkedPointer(PredicatedScalarEvolution &PSE,
+                  const ValueToValueMap &StridesMap, Value *Ptr,
+                  const Loop *L) {
+  ScalarEvolution *SE = PSE.getSE();
+  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
+  SmallVector<const SCEV *> Scevs;
+  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
+
+  // For now, we will only accept a forked pointer with two options.
+  if (Scevs.size() == 2)
+    return Scevs;
+
+  return {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)};
+}
+
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                           MemAccessInfo Access, Type *AccessTy,
                                           const ValueToValueMap &StridesMap,
@@ -696,34 +874,54 @@
                                           bool Assume) {
   Value *Ptr = Access.getPointer();
 
-  if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
-    return false;
+  SmallVector<const SCEV *> TranslatedPtrs =
+      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
-  // When we run after a failing dependency check we have to make sure
-  // we don't have wrapping pointers.
-  if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
-    auto *Expr = PSE.getSCEV(Ptr);
-    if (!Assume || !isa<SCEVAddRecExpr>(Expr))
+  for (const SCEV *PtrExpr : TranslatedPtrs) {
+    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
       return false;
-    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
-  }
+
+    // When we run after a failing dependency check we have to make sure
+    // we don't have wrapping pointers.
+    if (ShouldCheckWrap) {
+      // If we forked a pointer via a select, don't check for wrapping
+      // behaviour.
+      // TODO: Implement this; it requires checking the SCEVs individually
+      // instead of the overall ptr, since that just resolves to a SCEVUnknown.
+      if (TranslatedPtrs.size() > 1)
+        return false;
+
+      if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
+        auto *Expr = PSE.getSCEV(Ptr);
+        if (!Assume || !isa<SCEVAddRecExpr>(Expr))
+          return false;
+        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
+      }
+    }
+    // For the single (non-forked) pointer case, recompute the stride-replaced
+    // SCEV here: the wrap check above may have added new predicates to PSE
+    // (e.g. when versioning a symbolic stride), and the runtime check should
+    // use them.
+    if (TranslatedPtrs.size() == 1)
+      TranslatedPtrs[0] = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+  }
 
-  // The id of the dependence set.
- unsigned DepId; + for (const SCEV *PtrExpr : TranslatedPtrs) { + // The id of the dependence set. + unsigned DepId; - if (isDependencyCheckNeeded()) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; + if (isDependencyCheckNeeded()) { + Value *Leader = DepCands.getLeaderValue(Access).getPointer(); + unsigned &LeaderId = DepSetId[Leader]; + if (!LeaderId) + LeaderId = RunningDepId++; + DepId = LeaderId; + } else + // Each access has its own dependence set. + DepId = RunningDepId++; - bool IsWrite = Access.getInt(); - RtCheck.insert(TheLoop, Ptr, AccessTy, IsWrite, DepId, ASId, StridesMap, PSE); - LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); + bool IsWrite = Access.getInt(); + RtCheck.insert(TheLoop, Ptr, AccessTy, PtrExpr, IsWrite, DepId, ASId, + StridesMap, PSE); + LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); + } return true; } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll --- a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll @@ -3,16 +3,43 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2)) +; CHECK-NEXT: Member: {%Base2,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1)) +; CHECK-NEXT: Member: {%Base1,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_different_base_same_offset(float *A, float *B, float *C, int *D) { @@ -48,16 +75,37 @@ } ; CHECK-LABEL: function 'forked_ptrs_same_base_different_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base High: (404 + %Base)) +; CHECK-NEXT: Member: {(4 + %Base),+,4}<%for.body> +; CHECK-NEXT: Member: {%Base,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_same_base_different_offset(float *A, float *B, int *C) { @@ -97,19 +145,38 @@ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } -;;;; Cases that can be handled by a forked pointer but are not currently allowed. 
- ; CHECK-LABEL: function 'forked_ptrs_uniform_and_strided_forks': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base High: (1192 + %Base)) +; CHECK-NEXT: Member: (16 + %Base) +; CHECK-NEXT: Member: {%Base,+,12}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from forked_ptrs_same_base_different_offset with a manually ;;;; added uniform offset and a mul to provide a stride @@ -141,6 +208,8 @@ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +;;;; Cases that can be handled by a forked pointer but are not currently allowed. 
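+;;;; The function below exercises one such case; for it the analysis still
+;;;; reports "cannot identify array bounds" in the checks that follow.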
+ ; CHECK-LABEL: function 'forked_ptrs_gather_and_contiguous_forks': ; CHECK-NEXT: for.body: ; CHECK-NEXT: Report: cannot identify array bounds diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll --- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -17,22 +17,84 @@ define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) { ; CHECK-LABEL: @forked_ptrs_different_base_same_offset( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE2:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr float, float* [[BASE1:%.*]], i64 100 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE2]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ugt float* [[SCEVGEP10]], [[DEST]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x float*> poison, float* [[BASE1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT19]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float*> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds float, float* [[TMP9]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float*> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float*> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float*> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP10]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP12]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP14]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP16]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 3 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP24]], <4 x float>* [[TMP26]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP28]], 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2]], float* [[BASE1]] ; CHECK-NEXT: [[DOTSINK_IN:%.*]] = getelementptr inbounds float, float* [[SPEC_SELECT]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP29]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = 
add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -70,26 +132,83 @@
 define dso_local void @forked_ptrs_same_base_different_offset(float* nocapture readonly %Base, float* nocapture %Dest, i32* nocapture readonly %Preds) {
 ; CHECK-LABEL: @forked_ptrs_same_base_different_offset(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100
+; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE:%.*]], i64 101
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]]
+; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE]]
+; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND13:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND15:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[VEC_IND13]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_IND15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP9]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP11]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP13]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP15]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i64 0
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 1
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 2
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 3
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !alias.scope !17, !noalias !19
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT14]] = add <4 x i32> [[VEC_IND13]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL12:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP27]], 0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I_014]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP28]]
 ; CHECK-NEXT:    [[IDXPROM213:%.*]] = zext i32 [[OFFSET_0]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE:%.*]], i64 [[IDXPROM213]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[IDXPROM213]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP29]], float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 entry:
   br label %for.body