Index: include/llvm/Analysis/LoopInfo.h
===================================================================
--- include/llvm/Analysis/LoopInfo.h
+++ include/llvm/Analysis/LoopInfo.h
@@ -546,6 +546,7 @@
   /// from being unrolled more than is directed by a pragma if the loop
   /// unrolling pass is run more than once (which it generally is).
   void setLoopAlreadyUnrolled();
+  void setLoopAlreadyFlattened();
 
   /// Return true if no exit block for the loop has a predecessor that is
   /// outside the loop.
Index: lib/Analysis/LoopInfo.cpp
===================================================================
--- lib/Analysis/LoopInfo.cpp
+++ lib/Analysis/LoopInfo.cpp
@@ -301,6 +301,41 @@
   setLoopID(NewLoopID);
 }
 
+void Loop::setLoopAlreadyFlattened() {
+  MDNode *LoopID = getLoopID();
+  // First remove any existing loop flattening metadata.
+  SmallVector<Metadata *, 4> MDs;
+  // Reserve first location for self reference to the LoopID metadata node.
+  MDs.push_back(nullptr);
+
+  if (LoopID) {
+    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+      bool IsFlattenMetadata = false;
+      MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+      if (MD) {
+        const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+        IsFlattenMetadata =
+            S && S->getString().startswith("llvm.loop.flatten.");
+      }
+      if (!IsFlattenMetadata)
+        MDs.push_back(LoopID->getOperand(i));
+    }
+  }
+
+  // Add flatten(disable) metadata to disable future flattening.
+  LLVMContext &Context = getHeader()->getContext();
+  SmallVector<Metadata *, 1> DisableOperands;
+  DisableOperands.push_back(
+      MDString::get(Context, "llvm.loop.flatten.disable"));
+  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+  MDs.push_back(DisableNode);
+
+  MDNode *NewLoopID = MDNode::get(Context, MDs);
+  // Set operand 0 to refer to the loop id itself.
+  NewLoopID->replaceOperandWith(0, NewLoopID);
+  setLoopID(NewLoopID);
+}
+
 bool Loop::isAnnotatedParallel() const {
   MDNode *DesiredLoopIdMetadata = getLoopID();
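Note: any flatten-related entries already present on the loop ID are dropped and replaced. The metadata this leaves on the fallback loop's latch branch looks roughly like the following hand-written sketch (the !0/!1 numbering is illustrative, and self-referential loop IDs print as distinct nodes):

    br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.flatten.disable"}
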
Index: lib/Transforms/Scalar/LoopFlatten.cpp
===================================================================
--- lib/Transforms/Scalar/LoopFlatten.cpp
+++ lib/Transforms/Scalar/LoopFlatten.cpp
@@ -45,6 +45,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
 
 #define DEBUG_TYPE "loop-flatten"
 
@@ -62,6 +64,12 @@
                  cl::desc("Assume that the product of the two iteration "
                           "limits will never overflow"));
 
+static MDNode *GetMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
+  return nullptr;
+}
+
 // Finds the induction variable, increment and limit for a simple loop that we
 // can flatten.
 static bool findLoopComponents(
@@ -395,7 +403,8 @@
 
 static bool FlattenLoopPair(Loop *OuterLoop, Loop *InnerLoop, DominatorTree *DT,
                             LoopInfo *LI, ScalarEvolution *SE,
-                            AssumptionCache *AC, TargetTransformInfo *TTI,
+                            const LoopAccessInfo *LAI, AssumptionCache *AC,
+                            TargetTransformInfo *TTI,
                             std::function<void(Loop *)> markLoopAsDeleted) {
   Function *F = OuterLoop->getHeader()->getParent();
 
@@ -458,7 +467,6 @@
   // Check if the new iteration variable might overflow. In this case, we
   // need to version the loop, and select the original version at runtime if
   // the iteration space is too large.
-  // TODO: We currently don't version the loop.
   // TODO: it might be worth using a wider iteration variable rather than
   // versioning the loop, if a wide enough type is legal.
   bool MustVersionLoop = true;
@@ -468,14 +476,15 @@
     DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
     return false;
   } else if (OR == OverflowResult::MayOverflow) {
-    DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+    DEBUG(dbgs() << "Multiply might overflow, versioning loop\n");
   } else {
     DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
     MustVersionLoop = false;
   }
 
-  // We cannot safely flatten the loop. Exit now.
-  if (MustVersionLoop)
+  // Flattening only saves code size if we don't need to version the loop,
+  // so don't do it when optimising for size.
+  if (F->optForSize() && MustVersionLoop)
     return false;
 
   // Do the actual transformation.
@@ -485,15 +494,42 @@
     OptimizationRemark Remark(DEBUG_TYPE, "Flattened", InnerLoop->getStartLoc(),
                               InnerLoop->getHeader());
     OptimizationRemarkEmitter ORE(F);
-    Remark << "Flattened into outer loop";
+    if (!MustVersionLoop)
+      Remark << "Flattened into outer loop in-place";
+    else
+      Remark << "Flattened into outer loop, leaving original version when "
+                "overflow occurs";
     ORE.emit(Remark);
   }
 
-  Value *NewTripCount =
-      BinaryOperator::CreateMul(InnerLimit, OuterLimit, "flatten.tripcount",
-                                OuterLoop->getLoopPreheader()->getTerminator());
-  DEBUG(dbgs() << "Created new trip count in preheader: ";
-        NewTripCount->dump());
+  // TODO: Widen IV type if legal to prevent overflow.
+  Type *NewIVType = InnerInductionPHI->getType();
+  Value *NewTripCount;
+  Loop *FallbackLoop = nullptr;
+  if (MustVersionLoop) {
+    IRBuilder<> Builder(OuterLoop->getLoopPreheader()->getTerminator());
+    // FIXME: signedness of this check?
+    Value *M = Intrinsic::getDeclaration(
+        F->getParent(), Intrinsic::umul_with_overflow, NewIVType);
+    CallInst *Call = Builder.CreateCall(M, {InnerLimit, OuterLimit}, "limit");
+    Value *OverflowBit = Builder.CreateExtractValue(Call, 1, "overflow");
+    NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount");
+
+    LoopVersioning LV(*LAI, OuterLoop, LI, DT, SE, false);
+    SCEVUnionPredicate Pred;
+    Pred.add(
+        SE->getEqualPredicate(SE->getSCEV(OverflowBit),
+                              SE->getZero(Type::getInt1Ty(M->getContext()))));
+    LV.setSCEVChecks(Pred);
+    LV.versionLoop();
+    FallbackLoop = LV.getNonVersionedLoop();
+  } else {
+    NewTripCount = BinaryOperator::CreateMul(
+        InnerLimit, OuterLimit, "flatten.tripcount",
+        OuterLoop->getLoopPreheader()->getTerminator());
+    DEBUG(dbgs() << "Created new trip count in preheader: ";
+          NewTripCount->dump());
+  }
 
   // Fix up PHI nodes that take values from the inner loop back-edge, which
   // we are about to remove.
@@ -517,6 +553,12 @@
   for (Value *V : LinearIVUses)
     V->replaceAllUsesWith(OuterInductionPHI);
 
+  // If we made a fallback copy of the loop, it will still be flattenable if
+  // this pass is run again, but that wouldn't be profitable, so disable
+  // flattening of that loop.
+  if (FallbackLoop)
+    FallbackLoop->setLoopAlreadyFlattened();
+
   // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
   // deleted, and any information that we have about the outer loop invalidated.
   markLoopAsDeleted(InnerLoop);
@@ -530,16 +572,21 @@
 PreservedAnalyses LoopFlattenPass::run(Loop &L, LoopAnalysisManager &AM,
                                        LoopStandardAnalysisResults &AR,
                                        LPMUpdater &Updater) {
-  if (L.getSubLoops().size() != 1)
+  if (L.getSubLoops().size() != 1 ||
+      GetMetadataForLoop(&L, "llvm.loop.flatten.disable"))
     return PreservedAnalyses::all();
 
+  const LoopAccessInfo *LAI = &AM.getResult<LoopAccessAnalysis>(L, AR);
+
   Loop *InnerLoop = *L.begin();
   std::string LoopName = InnerLoop->getName();
 
   if (!FlattenLoopPair(
-          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI,
+          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, LAI, &AR.AC, &AR.TTI,
           [&](Loop *L) { Updater.markLoopAsDeleted(*L, LoopName); }))
     return PreservedAnalyses::all();
-  return getLoopPassPreservedAnalyses();
+  PreservedAnalyses PA = getLoopPassPreservedAnalyses();
+  PA.preserve<LoopAccessAnalysis>();
+  return PA;
 }
 
 namespace {
@@ -559,6 +606,8 @@
     AU.addPreserved<TargetTransformInfoWrapperPass>();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addPreserved<AssumptionCacheTracker>();
+    AU.addRequired<LoopAccessLegacyAnalysis>();
+    AU.addPreserved<LoopAccessLegacyAnalysis>();
   }
 };
 } // namespace
@@ -568,6 +617,7 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
                     false, false)
@@ -578,7 +628,8 @@
   if (skipLoop(L))
     return false;
 
-  if (L->getSubLoops().size() != 1)
+  if (L->getSubLoops().size() != 1 ||
+      GetMetadataForLoop(L, "llvm.loop.flatten.disable"))
     return false;
 
   ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -590,8 +641,10 @@
   AssumptionCache *AC =
       &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
           *L->getHeader()->getParent());
+  const LoopAccessInfo *LAI =
+      &getAnalysis<LoopAccessLegacyAnalysis>().getInfo(L);
 
   Loop *InnerLoop = *L->begin();
-  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, AC, TTI,
+  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, LAI, AC, TTI,
                          [&](Loop *L) { LPM.markLoopAsDeleted(*L); });
 }
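Note: at the source level, the new versioning path corresponds to the following sketch (illustrative C++ only; f, N and M are stand-ins, and __builtin_umul_overflow plays the role of the llvm.umul.with.overflow intrinsic the pass emits):

    // Before: a 2-deep nest whose inner IV uses are linear in i*M+j.
    for (unsigned i = 0; i < N; ++i)
      for (unsigned j = 0; j < M; ++j)
        f(i * M + j);

    // After: if N*M fits in the IV type, run the flattened loop; otherwise
    // fall back to the original nest, which is marked with
    // llvm.loop.flatten.disable so a later run does not flatten it again.
    unsigned Limit;
    if (!__builtin_umul_overflow(N, M, &Limit)) {
      for (unsigned k = 0; k < Limit; ++k)
        f(k); // linear uses of i*M+j are rewritten to the new IV
    } else {
      for (unsigned i = 0; i < N; ++i)
        for (unsigned j = 0; j < M; ++j)
          f(i * M + j);
    }
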
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -728,6 +728,18 @@
                    RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 }
 
+static void cloneLoopStructure(Loop *OrigLoop, Loop *ParentLoop, LoopInfo *LI,
+                               DenseMap<Loop *, Loop *> &LoopMap) {
+  Loop *NewLoop = LI->AllocateLoop();
+  LoopMap[OrigLoop] = NewLoop;
+  if (ParentLoop)
+    ParentLoop->addChildLoop(NewLoop);
+  else
+    LI->addTopLevelLoop(NewLoop);
+  for (Loop *Child : *OrigLoop)
+    cloneLoopStructure(Child, NewLoop, LI, LoopMap);
+}
+
 /// \brief Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p
 /// Blocks.
 ///
@@ -738,16 +750,11 @@
                              const Twine &NameSuffix, LoopInfo *LI,
                              DominatorTree *DT,
                              SmallVectorImpl<BasicBlock *> &Blocks) {
-  assert(OrigLoop->getSubLoops().empty() &&
-         "Loop to be cloned cannot have inner loop");
   Function *F = OrigLoop->getHeader()->getParent();
   Loop *ParentLoop = OrigLoop->getParentLoop();
 
-  Loop *NewLoop = LI->AllocateLoop();
-  if (ParentLoop)
-    ParentLoop->addChildLoop(NewLoop);
-  else
-    LI->addTopLevelLoop(NewLoop);
+  DenseMap<Loop *, Loop *> LoopMap;
+  cloneLoopStructure(OrigLoop, ParentLoop, LI, LoopMap);
 
   BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
   assert(OrigPH && "No preheader");
@@ -768,6 +775,8 @@
     VMap[BB] = NewBB;
 
     // Update LoopInfo.
+    Loop *NewLoop = LoopMap[LI->getLoopFor(BB)];
+    assert(NewLoop);
     NewLoop->addBasicBlockToLoop(NewBB, *LI);
 
     // Add DominatorTree node. After seeing all blocks, update to correct IDom.
@@ -787,9 +796,10 @@
   F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
                                 NewPH);
   F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
-                                NewLoop->getHeader()->getIterator(), F->end());
+                                LoopMap[OrigLoop]->getHeader()->getIterator(),
+                                F->end());
 
-  return NewLoop;
+  return LoopMap[OrigLoop];
 }
 
 /// \brief Duplicate non-Phi instructions from the beginning of block up to
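Note: cloneLoopWithPreheader() previously asserted that the loop had no sub-loops; LoopFlatten versions the outer loop of a nest, so the clone must now reproduce the whole loop tree. A minimal stand-alone model of the recursion (a toy loop type, not llvm::Loop):

    #include <map>
    #include <vector>

    struct ToyLoop { std::vector<ToyLoop *> Children; };

    // Mirror of cloneLoopStructure: allocate a clone of L, attach it under
    // Parent (or treat it as a new root), then recurse into sub-loops. The
    // Orig->Clone map lets block insertion find the right new loop later.
    // (Leaks are ignored; this is a sketch of the traversal only.)
    ToyLoop *cloneStructure(const ToyLoop *L, ToyLoop *Parent,
                            std::map<const ToyLoop *, ToyLoop *> &Map) {
      ToyLoop *Clone = new ToyLoop;
      Map[L] = Clone;
      if (Parent)
        Parent->Children.push_back(Clone);
      for (const ToyLoop *Child : L->Children)
        cloneStructure(Child, Clone, Map);
      return Clone;
    }
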
Index: lib/Transforms/Utils/LoopVersioning.cpp
===================================================================
--- lib/Transforms/Utils/LoopVersioning.cpp
+++ lib/Transforms/Utils/LoopVersioning.cpp
@@ -64,11 +64,10 @@
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
 
-  const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
   SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
                    "scev.check");
   SCEVRuntimeCheck =
-      Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator());
+      Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
   auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
 
   // Discard the SCEV runtime check if it is always true.
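Note: putting the pieces together, the check block that versioning creates computes both the flattened trip count and the overflow bit from a single intrinsic call, roughly as in this hand-written IR sketch (%InnerLimit and %OuterLimit are stand-ins; the value names match the "limit", "overflow" and "flatten.tripcount" strings used in FlattenLoopPair). The tests below check for this shape:

    lver.check:
      %limit = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %InnerLimit, i32 %OuterLimit)
      %overflow = extractvalue { i32, i1 } %limit, 1
      %flatten.tripcount = extractvalue { i32, i1 } %limit, 0
      ; branch expanded from the SCEV predicate "%overflow == 0"
      br i1 %overflow, label %loop.ph.lver.orig, label %loop.ph
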
Index: test/Transforms/LoopFlatten/loop-flatten.ll
===================================================================
--- test/Transforms/LoopFlatten/loop-flatten.ll
+++ test/Transforms/LoopFlatten/loop-flatten.ll
@@ -529,5 +529,256 @@
   ret i16 %ret.0.lcssa
 }
 
+; CHECK-LABEL: test8
+; Versioned loop
+define void @test8(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+; Entry block still contains the zero-iteration check
+; CHECK: entry:
+; CHECK: %[[ZERO_CHECK:.*]] = icmp sgt i32 %N, 0
+; CHECK: br i1 %[[ZERO_CHECK]], label %for.body4.lr.ph.lver.check, label %for.cond.cleanup
+
+; Loop versioning check block
+; CHECK: for.body4.lr.ph.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %N, i32 %N)
+; CHECK: extractvalue
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.ph.lver.orig, label %for.body4.lr.ph.ph
+
+; Pre-header for the original loop
+; CHECK: for.body4.lr.ph.ph.lver.orig:
+; CHECK: br label %for.body4.lr.ph.lver.orig
+
+
+; Original loop kept as-is (just with blocks renamed)
+for.body4.lr.ph:
+  %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+; CHECK: for.body4.lr.ph.lver.orig:
+; CHECK: br label %for.body4.lver.orig
+
+for.body4:
+  %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add nsw i32 %j.024, %mul
+  %use = add i32 %add, 10
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp ne i32 %inc, %N
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+; CHECK: for.body4.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.body4.lver.orig, label %for.cond.cleanup3.lver.orig
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp ne i32 %inc10, %N
+  br i1 %exitcond27, label %for.body4.lr.ph, label %for.cond.cleanup
+; CHECK: for.cond.cleanup3.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.lver.orig, label %for.cond.cleanup.loopexit
+
+
+; New, flattened loop
+; Pre-header
+; CHECK: for.body4.lr.ph.ph:
+; CHECK: br label %for.body4.lr.ph
+
+; Header
+; The next 3 blocks now have straight-line control flow, and will get merged by SimplifyCFG
+; CHECK: for.body4.lr.ph:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4
+
+; (Former) inner loop body
+; CHECK: for.body4:
+; Operands of %use have been replaced with the (formerly) outer iteration variable
+; CHECK: add i32 %[[OUTER_IV]], 10
+; Branch to tail portion of outer loop body is now unconditional
+; CHECK: br label %for.cond.cleanup3
+
+; Latch block
+; CHECK: for.cond.cleanup3:
+; CHECK: br i1 %exitcond27, label %for.body4.lr.ph, label %for.cond.cleanup.loopexit
+
+
+; Exit block, shared between both loop versions
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+; Function exit block (no change)
+for.cond.cleanup:
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
+; As above, but with PHI operands re-ordered
+define void @test9(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+; CHECK-LABEL: test9
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+; Entry block still contains the zero-iteration check
+; CHECK: entry:
+; CHECK: %[[ZERO_CHECK:.*]] = icmp sgt i32 %N, 0
+; CHECK: br i1 %[[ZERO_CHECK]], label %for.body4.lr.ph.lver.check, label %for.cond.cleanup
+
+; Loop versioning check block
+; CHECK: for.body4.lr.ph.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %N, i32 %N)
+; CHECK: extractvalue
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.ph.lver.orig, label %for.body4.lr.ph.ph
+
+; Pre-header for the original loop
+; CHECK: for.body4.lr.ph.ph.lver.orig:
+; CHECK: br label %for.body4.lr.ph.lver.orig
+
+
+; Original loop kept as-is (just with blocks renamed)
+for.body4.lr.ph:
+  %i.026 = phi i32 [ 0, %entry ], [ %inc10, %for.cond.cleanup3 ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+; CHECK: for.body4.lr.ph.lver.orig:
+; CHECK: br label %for.body4.lver.orig
+
+for.body4:
+  %j.024 = phi i32 [ %inc, %for.body4 ], [ 0, %for.body4.lr.ph ]
+  %add = add nsw i32 %j.024, %mul
+  %use = add i32 %add, 10
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp ne i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+; CHECK: for.body4.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.cond.cleanup3.lver.orig, label %for.body4.lver.orig
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp ne i32 %inc10, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+; CHECK: for.cond.cleanup3.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.cond.cleanup.loopexit, label %for.body4.lr.ph.lver.orig
+
+
+; New, flattened loop
+; Pre-header
+; CHECK: for.body4.lr.ph.ph:
+; CHECK: br label %for.body4.lr.ph
+
+; Header
+; CHECK: for.body4.lr.ph:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4
+
+; (Former) inner loop body
+; CHECK: for.body4:
+; CHECK: add i32 %[[OUTER_IV]], 10
+; CHECK: br label %for.cond.cleanup3
+
+; Latch block
+; CHECK: for.cond.cleanup3:
+; CHECK: br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body4.lr.ph
+
+
+; Exit block, shared between both loop versions
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+; Function exit block (no change)
+for.cond.cleanup:
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
+; The two loops have different bounds
+define void @test10(i32 %N, i32 %M, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+; CHECK-LABEL: test10
+entry:
+  %cmp24 = icmp eq i32 %N, 0
+  br i1 %cmp24, label %for.cond.cleanup, label %for.body.lr.ph
+; CHECK: entry:
+; CHECK: br i1 {{.*}}, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp222 = icmp eq i32 %M, 0
+  br i1 %cmp222, label %for.body.lr.ph.split, label %for.body.us
+; CHECK: for.body.lr.ph:
+; CHECK: br i1 {{.*}}, label %for.body.lr.ph.split, label %for.body.us.lver.check
+
+; Overflow check block
+; CHECK: for.body.us.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32
+; CHECK: br i1 %{{.*}}, label %for.body.us.ph.lver.orig, label %for.body.us.ph
+
+; Preheader for original outer loop
+; CHECK: for.body.us.ph.lver.orig:
+; CHECK: br label %for.body.us.lver.orig
+
+; Original loop
+for.body.us:                                      ; preds = %for.body.lr.ph, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %i.025.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
+  %mul.us = mul i32 %i.025.us, %M
+  br label %for.body4.us
+; CHECK: for.body.us.lver.orig:
+; CHECK: br label %for.body4.us.lver.orig
+
+for.body4.us:                                     ; preds = %for.body.us, %for.body4.us
+  %j.023.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add i32 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr i32, i32* %A, i32 %add.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %mul5.us = mul nsw i32 %0, %scale
+  %arrayidx8.us = getelementptr i32, i32* %C, i32 %add.us
+  store i32 %mul5.us, i32* %arrayidx8.us, align 4
+  %inc.us = add nuw nsw i32 %j.023.us, 1
+  %exitcond = icmp ne i32 %inc.us, %M
+  br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK: for.body4.us.lver.orig:
+; CHECK: br i1 %exitcond.lver.orig, label %for.body4.us.lver.orig, label %for.cond1.for.cond.cleanup3_crit_edge.us.lver.orig
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %inc10.us = add nuw nsw i32 %i.025.us, 1
+  %exitcond27 = icmp ne i32 %inc10.us, %N
+  br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.lver.orig:
+; CHECK: br i1 %exitcond27.lver.orig, label %for.body.us.lver.orig, label %for.cond.cleanup.loopexit
+
+; New loop preheader
+; CHECK: for.body.us.ph:
+; CHECK: br label %for.body.us
+
+; Body of new loop
+; CHECK: for.body.us:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4.us
+; CHECK: for.body4.us:
+; CHECK: getelementptr i32, i32* %A, i32 %[[OUTER_IV]]
+; CHECK: getelementptr i32, i32* %C, i32 %[[OUTER_IV]]
+; CHECK: br label %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK: br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+
+for.body.lr.ph.split:                             ; preds = %for.body.lr.ph
+  br label %for.cond.cleanup.loopexit26
+; CHECK: for.body.lr.ph.split:
+; CHECK: br label %for.cond.cleanup.loopexit26
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit26:                      ; preds = %for.body.lr.ph.split
+  br label %for.cond.cleanup
+; CHECK: for.cond.cleanup.loopexit26:
+; CHECK: br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit26, %for.cond.cleanup.loopexit, %entry
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
 declare i32 @func(i32)