Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -234,7 +234,7 @@ bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, - bool isPreheader); + bool isPreheader, bool isLatch); bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); bool optimizeInst(Instruction *I, bool &ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, @@ -645,12 +645,15 @@ /// blocks so we can split them the way we want them. bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { SmallPtrSet<BasicBlock *, 16> Preheaders; + SmallPtrSet<BasicBlock *, 16> Latches; SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end()); while (!LoopList.empty()) { Loop *L = LoopList.pop_back_val(); LoopList.insert(LoopList.end(), L->begin(), L->end()); if (BasicBlock *Preheader = L->getLoopPreheader()) Preheaders.insert(Preheader); + if (BasicBlock *Latch = L->getLoopLatch()) + Latches.insert(Latch); } bool MadeChange = false; @@ -658,8 +661,8 @@ for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB); - if (!DestBB || - !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB))) + if (!DestBB || !isMergingEmptyBlockProfitable( + BB, DestBB, Preheaders.count(BB), Latches.count(DestBB))) continue; eliminateMostlyEmptyBlock(BB); @@ -670,7 +673,8 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, - bool isPreheader) { + bool isPreheader, + bool isLatch) { // Do not delete loop preheaders if doing so would create a critical edge. // Loop preheaders can be good locations to spill registers. 
If the // preheader is deleted and we create a critical edge, registers may be @@ -694,6 +698,11 @@ isa<IndirectBrInst>(Pred->getTerminator()))) return true; + // If the destination block is an almost empty latch block then we can hoist + // the jump through the backedge, so it is profitable to merge. + if (DestBB->getTerminator() == DestBB->getFirstNonPHI() && isLatch) + return true; + if (BB->getTerminator() != BB->getFirstNonPHI()) return true; Index: lib/Transforms/Scalar/JumpThreading.cpp =================================================================== --- lib/Transforms/Scalar/JumpThreading.cpp +++ lib/Transforms/Scalar/JumpThreading.cpp @@ -237,7 +237,9 @@ if (BI && BI->isUnconditional() && BB != &BB->getParent()->getEntryBlock() && // If the terminator is the only non-phi instruction, try to nuke it. - BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) { + BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) && + !LoopHeaders.count( + cast<BranchInst>(BB->getTerminator())->getSuccessor(0))) { // FIXME: It is always conservatively correct to drop the info // for a block even if it doesn't get erased. This isn't totally // awesome, but it allows us to use AssertingVH to prevent nasty Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -5656,20 +5656,22 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); + BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0); if (SinkCommon && SinkThenElseCodeToEnd(BI)) return true; // If the Terminator is the only non-phi instruction, simplify the block. - // if LoopHeader is provided, check if the block is a loop header - // (This is for early invocations before loop simplify and vectorization - // to keep canonical loop forms for nested loops. 
- // These blocks can be eliminated when the pass is invoked later - // in the back-end.) + // if LoopHeader is provided, check if the block or its successor is a loop + // header (This is for early invocations before loop simplify and + // vectorization to keep canonical loop forms for nested loops. These blocks + // can be eliminated when the pass is invoked later in the back-end.) + bool NeedCanonicalLoop = + !LateSimplifyCFG && + (LoopHeaders && (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && - (!LoopHeaders || !LoopHeaders->count(BB)) && - TryToSimplifyUncondBranchFromEmptyBlock(BB)) + !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; // If the only instruction in the block is a seteq/setne comparison Index: test/CodeGen/AArch64/aarch64-loop-gep-opt.ll =================================================================== --- test/CodeGen/AArch64/aarch64-loop-gep-opt.ll +++ test/CodeGen/AArch64/aarch64-loop-gep-opt.ll @@ -19,9 +19,9 @@ do.body.i: ; CHECK-LABEL: do.body.i: -; CHECK: %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3 -; CHECK-NEXT: %4 = bitcast i8* %uglygep2 to i32* -; CHECK-NOT: %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032 +; CHECK: %uglygep1 = getelementptr i8, i8* %uglygep, i64 %3 +; CHECK-NEXT: %4 = bitcast i8* %uglygep1 to i32* +; CHECK-NOT: %uglygep1 = getelementptr i8, i8* %uglygep, i64 1032 %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ] Index: test/Transforms/JumpThreading/static-profile.ll =================================================================== --- test/Transforms/JumpThreading/static-profile.ll +++ test/Transforms/JumpThreading/static-profile.ll @@ -86,7 +86,7 @@ ; Verify the new backedge: ; CHECK: check_2.thread: ; CHECK-NEXT: call void @bar() -; CHECK-NEXT: br label %check_1 +; CHECK-NEXT: br label %check_3.thread check_2: %cond2 = 
icmp eq i32 %v, 2 @@ -100,7 +100,7 @@ ; Verify the new backedge: ; CHECK: eq_2: ; CHECK-NEXT: call void @bar() -; CHECK-NEXT: br label %check_1 +; CHECK-NEXT: br label %check_3.thread check_3: %condE = icmp eq i32 %v, 3 Index: test/Transforms/LoopUnroll/peel-loop.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop.ll +++ test/Transforms/LoopUnroll/peel-loop.ll @@ -18,9 +18,11 @@ ; CHECK: %[[INC2:.*]] = getelementptr inbounds i32, i32* %p, i64 2 ; CHECK: store i32 2, i32* %[[INC2]], align 4 ; CHECK: %[[CMP3:.*]] = icmp eq i32 %k, 3 -; CHECK: br i1 %[[CMP3]], label %for.end, label %[[LOOP:.*]] +; CHECK: br i1 %[[CMP3]], label %for.end, label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH]]: +; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK: %[[IV:.*]] = phi i32 [ {{.*}}, %[[LOOP]] ], [ 3, %[[NEXT2]] ] +; CHECK: %[[IV:.*]] = phi i32 [ 3, %[[LOOP_PH]] ], [ {{.*}}, %[[LOOP]] ] define void @basic(i32* %p, i32 %k) #0 { entry: @@ -65,9 +67,11 @@ ; CHECK: %[[INC2:.*]] = getelementptr inbounds i32, i32* %p, i64 2 ; CHECK: store i32 2, i32* %[[INC2]], align 4 ; CHECK: %[[CMP3:.*]] = icmp eq i32 %k, 3 -; CHECK: br i1 %[[CMP3]], label %for.end, label %[[LOOP:.*]] +; CHECK: br i1 %[[CMP3]], label %for.end, label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH]]: +; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK: %[[IV:.*]] = phi i32 [ %[[IV:.*]], %[[LOOP]] ], [ 3, %[[NEXT2]] ] +; CHECK: %[[IV:.*]] = phi i32 [ 3, %[[LOOP_PH]] ], [ %[[IV:.*]], %[[LOOP]] ] ; CHECK: %ret = phi i32 [ 0, %entry ], [ 1, %[[NEXT0]] ], [ 2, %[[NEXT1]] ], [ 3, %[[NEXT2]] ], [ %[[IV]], %[[LOOP]] ] ; CHECK: ret i32 %ret define i32 @output(i32* %p, i32 %k) #0 { Index: test/Transforms/LoopUnroll/pr33605.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/pr33605.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -disable-output -stats -simplifycfg -loop-rotate -loop-unroll -info-output-file - | FileCheck 
%s --check-prefix=STATS + +; Test that the loop can be unrolled. We need to skip simplifying unconditional branches from empty blocks in simplifyCFG to do it. + +; STATS: 1 loop-rotate - Number of loops rotated +; STATS: 1 loop-unroll - Number of loops completely unrolled + +; void foo(); +; bool test(int a, int b, int *c) { +; bool changed = false; +; for (unsigned int i = 2; i--;) { +; int r = a | b; +; if ( r != c[i]) { +; c[i] = r; +; foo(); +; changed = true; +; } +; } +; return changed; +; } +define i1 @test(i32 %a, i32 %b, i32* %c) { +entry: + br label %for.cond + +for.cond: ; preds = %if.end, %entry + %i.0 = phi i32 [ 2, %entry ], [ %dec, %if.end ] + %changed.0.off0 = phi i1 [ false, %entry ], [ %changed.1.off0, %if.end ] + %dec = add nsw i32 %i.0, -1 + %tobool = icmp eq i32 %i.0, 0 + br i1 %tobool, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.cond + %changed.0.off0.lcssa = phi i1 [ %changed.0.off0, %for.cond ] + ret i1 %changed.0.off0.lcssa + +for.body: ; preds = %for.cond + %or = or i32 %a, %b + %idxprom = sext i32 %dec to i64 + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %cmp = icmp eq i32 %or, %0 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %for.body + store i32 %or, i32* %arrayidx, align 4 + call void @foo() + br label %if.end + +if.end: ; preds = %for.body, %if.then + %changed.1.off0 = phi i1 [ true, %if.then ], [ %changed.0.off0, %for.body ] + br label %for.cond +} + +declare void @foo() Index: test/Transforms/LoopUnswitch/2015-06-17-Metadata.ll =================================================================== --- test/Transforms/LoopUnswitch/2015-06-17-Metadata.ll +++ test/Transforms/LoopUnswitch/2015-06-17-Metadata.ll @@ -16,7 +16,7 @@ %cmp1 = icmp eq i32 %a, 12345 br i1 %cmp1, label %if.then, label %if.else, !prof !0 ; CHECK: %cmp1 = icmp eq i32 %a, 12345 -; CHECK-NEXT: br i1 %cmp1, label %for.body.us, label %for.body, !prof !0 +; 
CHECK-NEXT: br i1 %cmp1, label %for.body.preheader.split.us, label %for.body.preheader.split, !prof !0 if.then: ; preds = %for.body ; CHECK: for.body.us: ; CHECK: add nsw i32 %{{.*}}, 123 @@ -53,7 +53,7 @@ br label %for.body ;CHECK: entry: ;CHECK-NEXT: %cmp1 = icmp eq i32 1, 2 -;CHECK-NEXT: br i1 %cmp1, label %for.body, label %for.cond.cleanup.split, !prof !1 +;CHECK-NEXT: br i1 %cmp1, label %entry.split, label %for.cond.cleanup.split, !prof !1 ;CHECK: for.body: for.body: ; preds = %for.inc, %entry %inc.i = phi i32 [ 0, %entry ], [ %inc, %if.then ] Index: test/Transforms/LoopUnswitch/infinite-loop.ll =================================================================== --- test/Transforms/LoopUnswitch/infinite-loop.ll +++ test/Transforms/LoopUnswitch/infinite-loop.ll @@ -6,7 +6,7 @@ ; Loop unswitching shouldn't trivially unswitch the true case of condition %a ; in the code here because it leads to an infinite loop. While this doesn't ; contain any instructions with side effects, it's still a kind of side effect. -; It can trivially unswitch on the false cas of condition %a though. +; It can trivially unswitch on the false case of condition %a though. 
; STATS: 2 loop-unswitch - Number of branches unswitched ; STATS: 2 loop-unswitch - Number of unswitches that are trivial @@ -16,7 +16,7 @@ ; CHECK-NEXT: br i1 %a, label %entry.split, label %abort0.split ; CHECK: entry.split: -; CHECK-NEXT: br i1 %b, label %for.body, label %abort1.split +; CHECK-NEXT: br i1 %b, label %entry.split.split, label %abort1.split ; CHECK: for.body: ; CHECK-NEXT: br label %for.body Index: test/Transforms/LoopVectorize/X86/float-induction-x86.ll =================================================================== --- test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -O3 -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s +; RUN: opt < %s -O3 -latesimplifycfg -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s ; This test checks auto-vectorization with FP induction variable. ; The FP operation is not "fast" and requires "fast-math" function attribute. 
Index: test/Transforms/LoopVectorize/float-induction.ll =================================================================== --- test/Transforms/LoopVectorize/float-induction.ll +++ test/Transforms/LoopVectorize/float-induction.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -latesimplifycfg -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s @fp_inc = common global float 0.000000e+00, align 4 Index: test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll =================================================================== --- test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -1322,8 +1322,8 @@ ; Speculation depth must be limited to avoid a zero-cost instruction cycle. 
; CHECK-LABEL: @PR26308( -; CHECK: while.body: -; CHECK-NEXT: br label %while.body +; CHECK: cleanup4: +; CHECK-NEXT: br label %cleanup4 define i32 @PR26308(i1 %B, i64 %load) { entry: Index: test/Transforms/SimplifyCFG/multiple-phis.ll =================================================================== --- test/Transforms/SimplifyCFG/multiple-phis.ll +++ test/Transforms/SimplifyCFG/multiple-phis.ll @@ -1,4 +1,4 @@ -; RUN: opt -simplifycfg -S < %s | FileCheck %s +; RUN: opt -latesimplifycfg -S < %s | FileCheck %s ; It's not worthwhile to if-convert one of the phi nodes and leave ; the other behind, because that still requires a branch. If Index: test/Transforms/SimplifyCFG/preserve-llvm-loop-metadata.ll =================================================================== --- test/Transforms/SimplifyCFG/preserve-llvm-loop-metadata.ll +++ test/Transforms/SimplifyCFG/preserve-llvm-loop-metadata.ll @@ -1,4 +1,4 @@ -; RUN: opt -simplifycfg -S < %s | FileCheck %s +; RUN: opt -latesimplifycfg -S < %s | FileCheck %s define void @test1(i32 %n) #0 { entry: