Index: llvm/trunk/include/llvm/Analysis/Utils/Local.h =================================================================== --- llvm/trunk/include/llvm/Analysis/Utils/Local.h +++ llvm/trunk/include/llvm/Analysis/Utils/Local.h @@ -141,6 +141,18 @@ bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI = nullptr); +/// Delete all of the instructions in `DeadInsts`, and all other instructions +/// that deleting these in turn causes to be trivially dead. +/// +/// The initial instructions in the provided vector must all have empty use +/// lists and satisfy `isInstructionTriviallyDead`. +/// +/// `DeadInsts` will be used as scratch storage for this routine and will be +/// empty afterward. +void RecursivelyDeleteTriviallyDeadInstructions( + SmallVectorImpl &DeadInsts, + const TargetLibraryInfo *TLI = nullptr); + /// If the specified value is an effectively dead PHI node, due to being a /// def-use chain of single-use nodes that either forms a cycle or is terminated /// by a trivially dead instruction, delete it. If that makes any of its Index: llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/Utils/Local.h" @@ -45,118 +46,116 @@ STATISTIC(NumSimplified, "Number of redundant instructions simplified"); -static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, - AssumptionCache *AC, - const TargetLibraryInfo *TLI) { - SmallVector ExitBlocks; - L->getUniqueExitBlocks(ExitBlocks); - array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); - +static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, + const TargetLibraryInfo &TLI) { + const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + SimplifyQuery SQ(DL, &TLI, &DT, &AC); + + // On the first pass over the loop body we try to simplify every instruction. + // On subsequent passes, we can restrict this to only simplifying instructions + // where the inputs have been updated. We end up needing two sets: one + // containing the instructions we are simplifying in *this* pass, and one for + // the instructions we will want to simplify in the *next* pass. We use + // pointers so we can swap between two stably allocated sets. SmallPtrSet S1, S2, *ToSimplify = &S1, *Next = &S2; - // The bit we are stealing from the pointer represents whether this basic - // block is the header of a subloop, in which case we only process its phis. - using WorklistItem = PointerIntPair; - SmallVector VisitStack; - SmallPtrSet Visited; + // Track the PHI nodes that have already been visited during each iteration so + // that we can identify when it is necessary to iterate. + SmallPtrSet VisitedPHIs; + + // While simplifying we may discover dead code or cause code to become dead. + // Keep track of all such instructions and we will delete them at the end. + SmallVector DeadInsts; + + // First we want to create an RPO traversal of the loop body. By processing in + // RPO we can ensure that definitions are processed prior to uses (for non PHI + // uses) in all cases. This ensures we maximize the simplifications in each + // iteration over the loop and minimizes the possible causes for continuing to + // iterate. + LoopBlocksRPO RPOT(&L); + RPOT.perform(&LI); bool Changed = false; - bool LocalChanged; - do { - LocalChanged = false; - - VisitStack.clear(); - Visited.clear(); - - VisitStack.push_back(WorklistItem(L->getHeader(), false)); - - while (!VisitStack.empty()) { - WorklistItem Item = VisitStack.pop_back_val(); - BasicBlock *BB = Item.getPointer(); - bool IsSubloopHeader = Item.getInt(); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - - // Simplify instructions in the current basic block. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not - // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + for (;;) { + for (BasicBlock *BB : RPOT) { + for (Instruction &I : *BB) { + if (auto *PI = dyn_cast(&I)) + VisitedPHIs.insert(PI); + + if (I.use_empty()) { + if (isInstructionTriviallyDead(&I, &TLI)) + DeadInsts.push_back(&I); continue; - - // Don't bother simplifying unused instructions. - if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC}); - if (V && LI->replacementPreservesLCSSAForm(I, V)) { - // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) - Next->insert(cast(U)); - - I->replaceAllUsesWith(V); - LocalChanged = true; - ++NumSimplified; - } - } - if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - LocalChanged = true; } - if (IsSubloopHeader && !isa(I)) - break; - } + // We special case the first iteration which we can detect due to the + // empty `ToSimplify` set. + bool IsFirstIteration = ToSimplify->empty(); - // Add all successors to the worklist, except for loop exit blocks and the - // bodies of subloops. We visit the headers of loops so that we can - // process - // their phis, but we contract the rest of the subloop body and only - // follow - // edges leading back to the original loop. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; - ++SI) { - BasicBlock *SuccBB = *SI; - if (!Visited.insert(SuccBB).second) + if (!IsFirstIteration && !ToSimplify->count(&I)) continue; - const Loop *SuccLoop = LI->getLoopFor(SuccBB); - if (SuccLoop && SuccLoop->getHeader() == SuccBB && - L->contains(SuccLoop)) { - VisitStack.push_back(WorklistItem(SuccBB, true)); - - SmallVector SubLoopExitBlocks; - SuccLoop->getExitBlocks(SubLoopExitBlocks); - - for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { - BasicBlock *ExitBB = SubLoopExitBlocks[i]; - if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second) - VisitStack.push_back(WorklistItem(ExitBB, false)); - } - + Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I)); + if (!V || !LI.replacementPreservesLCSSAForm(&I, V)) continue; - } - bool IsExitBlock = - std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB); - if (IsExitBlock) - continue; + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); + UI != UE;) { + Use &U = *UI++; + auto *UserI = cast(U.getUser()); + U.set(V); + + // If the instruction is used by a PHI node we have already processed + // we'll need to iterate on the loop body to converge, so add it to + // the next set. + if (auto *UserPI = dyn_cast(UserI)) + if (VisitedPHIs.count(UserPI)) { + Next->insert(UserPI); + continue; + } + + // If we are only simplifying targeted instructions and the user is an + // instruction in the loop body, add it to our set of targeted + // instructions. Because we process defs before uses (outside of PHIs) + // we won't have visited it yet. + // + // We also skip any uses outside of the loop being simplified. Those + // should always be PHI nodes due to LCSSA form, and we don't want to + // try to simplify those away. + assert((L.contains(UserI) || isa(UserI)) && + "Uses outside the loop should be PHI nodes due to LCSSA!"); + if (!IsFirstIteration && L.contains(UserI)) + ToSimplify->insert(UserI); + } - VisitStack.push_back(WorklistItem(SuccBB, false)); + assert(I.use_empty() && "Should always have replaced all uses!"); + if (isInstructionTriviallyDead(&I, &TLI)) + DeadInsts.push_back(&I); + ++NumSimplified; + Changed = true; } } - // Place the list of instructions to simplify on the next loop iteration - // into ToSimplify. - std::swap(ToSimplify, Next); - Next->clear(); + // Delete any dead instructions found thus far now that we've finished an + // iteration over all instructions in all the loop blocks. + if (!DeadInsts.empty()) { + Changed = true; + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI); + } - Changed |= LocalChanged; - } while (LocalChanged); + // If we never found a PHI that needs to be simplified in the next + // iteration, we're done. + if (Next->empty()) + break; + + // Otherwise, put the next set in place for the next iteration and reset it + // and the visited PHIs for that iteration. + std::swap(Next, ToSimplify); + Next->clear(); + VisitedPHIs.clear(); + DeadInsts.clear(); + } return Changed; } @@ -174,21 +173,20 @@ bool runOnLoop(Loop *L, LPPassManager &LPM) override { if (skipLoop(L)) return false; - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - LoopInfo *LI = &getAnalysis().getLoopInfo(); - AssumptionCache *AC = - &getAnalysis().getAssumptionCache( + DominatorTree &DT = getAnalysis().getDomTree(); + LoopInfo &LI = getAnalysis().getLoopInfo(); + AssumptionCache &AC = + getAnalysis().getAssumptionCache( *L->getHeader()->getParent()); - const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + const TargetLibraryInfo &TLI = + getAnalysis().getTLI(); - return SimplifyLoopInst(L, DT, LI, AC, TLI); + return simplifyLoopInst(*L, DT, LI, AC, TLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); @@ -200,7 +198,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { - if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI)) + if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); Index: llvm/trunk/lib/Transforms/Utils/Local.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/Local.cpp +++ llvm/trunk/lib/Transforms/Utils/Local.cpp @@ -434,18 +434,31 @@ SmallVector DeadInsts; DeadInsts.push_back(I); + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); - do { - I = DeadInsts.pop_back_val(); - salvageDebugInfo(*I); + return true; +} + +void llvm::RecursivelyDeleteTriviallyDeadInstructions( + SmallVectorImpl &DeadInsts, const TargetLibraryInfo *TLI) { + // Process the dead instruction list until empty. + while (!DeadInsts.empty()) { + Instruction &I = *DeadInsts.pop_back_val(); + assert(I.use_empty() && "Instructions with uses are not dead."); + assert(isInstructionTriviallyDead(&I, TLI) && + "Live instruction found in dead worklist!"); + + // Don't lose the debug info while deleting the instructions. + salvageDebugInfo(I); // Null out all of the instruction's operands to see if any operand becomes // dead as we go. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - Value *OpV = I->getOperand(i); - I->setOperand(i, nullptr); + for (Use &OpU : I.operands()) { + Value *OpV = OpU.get(); + OpU.set(nullptr); - if (!OpV->use_empty()) continue; + if (!OpV->use_empty()) + continue; // If the operand is an instruction that became dead as we nulled out the // operand, and if it is 'trivially' dead, delete it in a future loop @@ -455,10 +468,8 @@ DeadInsts.push_back(OpI); } - I->eraseFromParent(); - } while (!DeadInsts.empty()); - - return true; + I.eraseFromParent(); + } } /// areAllUsesEqual - Check whether the uses of a value are all the same. Index: llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll =================================================================== --- llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll +++ llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S %s -passes=loop-instsimplify | FileCheck %s + +; Test very basic folding and propagation occurs within a loop body. This should +; collapse to the loop iteration structure and the LCSSA PHI node. +define i32 @test1(i32 %n, i32 %x) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 +; CHECK-NEXT: [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[X_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %x.add = add nsw i32 %x, 0 + %x.sub = sub i32 %x.add, 0 + %x.and = and i32 %x.sub, -1 + %i.next = add nsw i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %n + br i1 %i.cmp, label %loop, label %exit + +exit: + %x.lcssa = phi i32 [ %x.and, %loop ] + ret i32 %x.lcssa +} + +; Test basic loop structure that still has a simplification feed a prior PHI. +define i32 @test2(i32 %n, i32 %x) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 +; CHECK-NEXT: [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[X_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %x.loop = phi i32 [ %x, %entry ], [ %x.next, %loop ] + %x.next = add nsw i32 %x.loop, 0 + %i.next = add nsw i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %n + br i1 %i.cmp, label %loop, label %exit + +exit: + %x.lcssa = phi i32 [ %x.loop, %loop ] + ret i32 %x.lcssa +} + +; Test a diamond CFG with inner PHI nodes. +define i32 @test3(i32 %n, i32 %x) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[X_CMP:%.*]] = icmp slt i32 [[I]], 42 +; CHECK-NEXT: br i1 [[X_CMP]], label [[LOOP_LHS:%.*]], label [[LOOP_RHS:%.*]] +; CHECK: loop.lhs: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.rhs: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 +; CHECK-NEXT: [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[X_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ] + %x.loop = phi i32 [ %x, %entry ], [ %x.phi, %loop.latch ] + %x.add = add nsw i32 %x.loop, 0 + %x.cmp = icmp slt i32 %i, 42 + br i1 %x.cmp, label %loop.lhs, label %loop.rhs + +loop.lhs: + %x.l.add = add nsw i32 %x.add, 0 + br label %loop.latch + +loop.rhs: + %x.r.sub = sub nsw i32 %x.add, 0 + br label %loop.latch + +loop.latch: + %x.phi = phi i32 [ %x.l.add, %loop.lhs ], [ %x.r.sub, %loop.rhs ] + %i.next = add nsw i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %n + br i1 %i.cmp, label %loop, label %exit + +exit: + %x.lcssa = phi i32 [ %x.loop, %loop.latch ] + ret i32 %x.lcssa +} + +; Test an inner loop that is only simplified when processing the outer loop, and +; an outer loop only simplified when processing the inner loop. +define i32 @test4(i32 %n, i32 %m, i32 %x) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br label [[LOOP_INNER:%.*]] +; CHECK: loop.inner: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER]] ] +; CHECK-NEXT: [[J_NEXT]] = add nsw i32 [[J]], 1 +; CHECK-NEXT: [[J_CMP:%.*]] = icmp slt i32 [[J_NEXT]], [[M:%.*]] +; CHECK-NEXT: br i1 [[J_CMP]], label [[LOOP_INNER]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 +; CHECK-NEXT: [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[X_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ] + %x.loop = phi i32 [ %x, %entry ], [ %x.inner.lcssa, %loop.latch ] + %x.add = add nsw i32 %x.loop, 0 + br label %loop.inner + +loop.inner: + %j = phi i32 [ 0, %loop ], [ %j.next, %loop.inner ] + %x.inner.loop = phi i32 [ %x.add, %loop ], [ %x.inner.add, %loop.inner ] + %x.inner.add = add nsw i32 %x.inner.loop, 0 + %j.next = add nsw i32 %j, 1 + %j.cmp = icmp slt i32 %j.next, %m + br i1 %j.cmp, label %loop.inner, label %loop.latch + +loop.latch: + %x.inner.lcssa = phi i32 [ %x.inner.loop, %loop.inner ] + %i.next = add nsw i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %n + br i1 %i.cmp, label %loop, label %exit + +exit: + %x.lcssa = phi i32 [ %x.loop, %loop.latch ] + ret i32 %x.lcssa +}