Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -1515,6 +1516,46 @@ SafetyInfo->BlockColors = colorEHFunclets(*Fn); } +/// Return true if we can prove that the given ExitBlock is not reached on the +/// first iteration of the given loop. That is, the backedge of the loop must +/// be executed before the ExitBlock is executed in any dynamic execution trace. +static bool CanProveNotTakenFirstIteration(BasicBlock *ExitBlock, + const DominatorTree *DT, + const Loop *CurLoop) { + auto *CondExitBlock = ExitBlock->getSinglePredecessor(); + if (!CondExitBlock) + // expect unique exits + return false; + assert(CurLoop->contains(CondExitBlock) && "meaning of exit block"); + auto *BI = dyn_cast(CondExitBlock->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + // todo: handle fcmp someday + // todo: this would be a lot more powerful if we used scev, but all the + // plumbing is currently missing to pass a pointer in from the pass + auto *ICI = dyn_cast(BI->getCondition()); + if (!ICI) + return false; + // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known + auto *LHS = dyn_cast(ICI->getOperand(0)); + auto *RHS = ICI->getOperand(1); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) + return false; + auto DL = ExitBlock->getModule()->getDataLayout(); + auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader()); + auto *SimpleValOrNull = SimplifyICmpInst(ICI->getPredicate(), + IVStart, RHS, + {DL, /*TLI*/ nullptr, + DT, /*AC*/ nullptr, BI}); + auto *SimpleCst = dyn_cast_or_null(SimpleValOrNull); + if (!SimpleCst) + return false; + if (ExitBlock == BI->getSuccessor(0)) + return SimpleCst->isZeroValue(); + assert(ExitBlock == BI->getSuccessor(1) && "implied by above"); + return SimpleCst->isAllOnesValue(); +} + /// Returns true if the instruction in a loop is guaranteed to execute at least /// once. bool llvm::isGuaranteedToExecute(const Instruction &Inst, @@ -1537,6 +1578,22 @@ if (SafetyInfo->MayThrow) return false; + // Note: There are two styles of reasoning intermixed below for + // implementation efficiency reasons. They are: + // 1) If we can prove that the instruction dominates all exit blocks, then we + // know the instruction must have executed on *some* iteration before we + // exit. We do not prove *which* iteration the instruction must execute on. + // 2) If we can prove that the instruction dominates the latch and all exits + // which might be taken on the first iteration, we know the instruction must + // execute on the first iteration. This second style allows a conditional + // exit before the instruction of interest which is provably not taken on the + // first iteration. This is a quite common case for range check like + // patterns. TODO: support loops with multiple latches. + + const bool InstDominatesLatch = + CurLoop->getLoopLatch() != nullptr && + DT->dominates(Inst.getParent(), CurLoop->getLoopLatch()); + // Get the exit blocks for the current loop. SmallVector ExitBlocks; CurLoop->getExitBlocks(ExitBlocks); @@ -1544,7 +1601,9 @@ // Verify that the block dominates each of the exit blocks of the loop. for (BasicBlock *ExitBlock : ExitBlocks) if (!DT->dominates(Inst.getParent(), ExitBlock)) - return false; + if (!InstDominatesLatch || + !CanProveNotTakenFirstIteration(ExitBlock, DT, CurLoop)) + return false; // As a degenerate case, if the loop is statically infinite then we haven't // proven anything since there are no exit blocks. Index: test/Transforms/LICM/hoist-mustexec.ll =================================================================== --- test/Transforms/LICM/hoist-mustexec.ll +++ test/Transforms/LICM/hoist-mustexec.ll @@ -0,0 +1,249 @@ +; RUN: opt -S -basicaa -licm < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(licm)' -S %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @f() nounwind + +; constant fold on first ieration +define i32 @test1(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test1( +entry: +; CHECK: %i1 = load i32, i32* %a, align 4 +; CHECK-NEXT: br label %for.body + br label %for.body + +for.body: + %iv = phi i32 [ 0, %entry ], [ %inc, %continue ] + %acc = phi i32 [ 0, %entry ], [ %add, %continue ] + %r.chk = icmp ult i32 %iv, 2000 + br i1 %r.chk, label %continue, label %fail +continue: + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +; Count down from a.length w/entry guard +; TODO: currently unable to prove the following: +; ule i32 (add nsw i32 %len, -1), %len where len is [0, 512] +define i32 @test2(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test2( +entry: + %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512} + %is.non.pos = icmp eq i32 %len, 0 + br i1 %is.non.pos, label %fail, label %preheader +preheader: + %lenminusone = add nsw i32 %len, -1 + br label %for.body +for.body: + %iv = phi i32 [ %lenminusone, %preheader ], [ %dec, %continue ] + %acc = phi i32 [ 0, %preheader ], [ %add, %continue ] + %r.chk = icmp ule i32 %iv, %len + br i1 %r.chk, label %continue, label %fail +continue: +; CHECK-LABEL: continue +; CHECK: %i1 = load i32, i32* %a, align 4 + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %dec = add nsw i32 %iv, -1 + %exitcond = icmp eq i32 %dec, 0 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +; trivially true for zero +define i32 @test3(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test3( +entry: + %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512} + %is.zero = icmp eq i32 %len, 0 + br i1 %is.zero, label %fail, label %preheader +preheader: +; CHECK: %i1 = load i32, i32* %a, align 4 +; CHECK-NEXT: br label %for.body + br label %for.body +for.body: + %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ] + %acc = phi i32 [ 0, %preheader ], [ %add, %continue ] + %r.chk = icmp ule i32 %iv, %len + br i1 %r.chk, label %continue, label %fail +continue: + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +; requires fact length is non-zero +; TODO: IsKnownNonNullFromDominatingConditions is currently only be done for +; pointers; should handle integers too +define i32 @test4(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test4( +entry: + %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512} + %is.zero = icmp eq i32 %len, 0 + br i1 %is.zero, label %fail, label %preheader +preheader: + br label %for.body +for.body: + %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ] + %acc = phi i32 [ 0, %preheader ], [ %add, %continue ] + %r.chk = icmp ult i32 %iv, %len + br i1 %r.chk, label %continue, label %fail +continue: +; CHECK-LABEL: continue +; CHECK: %i1 = load i32, i32* %a, align 4 + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +; variation on test1 with branch swapped +define i32 @test-brswap(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test-brswap( +entry: +; CHECK: %i1 = load i32, i32* %a, align 4 +; CHECK-NEXT: br label %for.body + br label %for.body + +for.body: + %iv = phi i32 [ 0, %entry ], [ %inc, %continue ] + %acc = phi i32 [ 0, %entry ], [ %add, %continue ] + %r.chk = icmp ugt i32 %iv, 2000 + br i1 %r.chk, label %fail, label %continue +continue: + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +define i32 @test-nonphi(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test-nonphi( +entry: + br label %for.body + +for.body: +; CHECK-LABEL: continue +; CHECK: %i1 = load i32, i32* %a, align 4 + %iv = phi i32 [ 0, %entry ], [ %inc, %continue ] + %acc = phi i32 [ 0, %entry ], [ %add, %continue ] + %xor = xor i32 %iv, 72 + %r.chk = icmp ugt i32 %xor, 2000 + br i1 %r.chk, label %fail, label %continue +continue: + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +define i32 @test-wrongphi(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test-wrongphi( +entry: + br label %for.body + +for.body: + %iv = phi i32 [ 0, %entry ], [ %inc, %continue ] + %acc = phi i32 [ 0, %entry ], [ %add, %continue ] + br label %dummy_block +dummy_block: + %wrongphi = phi i32 [12, %for.body] + %r.chk = icmp ugt i32 %wrongphi, 2000 + br i1 %r.chk, label %fail, label %continue +continue: +; CHECK-LABEL: continue +; CHECK: %i1 = load i32, i32* %a, align 4 + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +} + +; This works because loop-simplify is run implicitly, but test for it anyways +define i32 @test-multiple-latch(i32* noalias nocapture readonly %a) nounwind uwtable { +; CHECK-LABEL: @test-multiple-latch( +entry: +; CHECK: %i1 = load i32, i32* %a, align 4 +; CHECK-NEXT: br label %for.body + br label %for.body + +for.body: + %iv = phi i32 [ 0, %entry ], [ %inc, %continue1 ], [ %inc, %continue2 ] + %acc = phi i32 [ 0, %entry ], [ %add, %continue1 ], [ %add, %continue2 ] + %r.chk = icmp ult i32 %iv, 2000 + br i1 %r.chk, label %continue1, label %fail +continue1: + %i1 = load i32, i32* %a, align 4 + %add = add nsw i32 %i1, %acc + %inc = add nuw nsw i32 %iv, 1 + %cmp = icmp eq i32 %add, 0 + br i1 %cmp, label %continue2, label %for.body +continue2: + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %add + +fail: + call void @f() + ret i32 -1 +}