Index: llvm/include/llvm/Analysis/ScalarEvolution.h =================================================================== --- llvm/include/llvm/Analysis/ScalarEvolution.h +++ llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1180,6 +1180,15 @@ /// Try to apply information from loop guards for \p L to \p Expr. const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L); + /// Test whether the condition described by Pred, LHS, and RHS is true + /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. + bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, + const SCEV *FoundRHS, + const Instruction *Context = nullptr); + private: /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a /// Value is deleted. @@ -1779,15 +1788,6 @@ const Instruction *Context); /// Test whether the condition described by Pred, LHS, and RHS is true - /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is - /// true in given Context. If Context is nullptr, then the found predicate is - /// true everywhere. - bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, - const SCEV *FoundRHS, - const Instruction *Context = nullptr); - - /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is /// true in given Context. If Context is nullptr, then the found predicate is /// true everywhere. Index: llvm/include/llvm/Transforms/Scalar/InductiveUnrolling.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/InductiveUnrolling.h @@ -0,0 +1,21 @@ +#ifndef LLVM_TRANSFORMS_SCALAR_INDUCTIVEUNROLLING_H +#define LLVM_TRANSFORMS_SCALAR_INDUCTIVEUNROLLING_H + +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Loop; +class LPMUpdater; + +class InductiveUnrollingPass : public PassInfoMixin { +public: + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + + +} + +#endif // LLVM_TRANSFORMS_SCALAR_INDUCTIVEUNROLLING_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -154,6 +154,7 @@ #include "llvm/Transforms/Scalar/IVUsersPrinter.h" #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h" +#include "llvm/Transforms/Scalar/InductiveUnrolling.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/JumpThreading.h" Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -399,6 +399,7 @@ LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) +LOOP_PASS("inductive-unroll", InductiveUnrollingPass()) LOOP_PASS("loop-unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -19,6 +19,7 @@ GVNSink.cpp IVUsersPrinter.cpp InductiveRangeCheckElimination.cpp + InductiveUnrolling.cpp IndVarSimplify.cpp InferAddressSpaces.cpp InstSimplifyPass.cpp Index: llvm/lib/Transforms/Scalar/InductiveUnrolling.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/InductiveUnrolling.cpp @@ -0,0 +1,191 @@ +#include "llvm/Transforms/Scalar/InductiveUnrolling.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" + +using namespace llvm; + +#define DEBUG_TYPE "inductive-unroll" + +struct InductiveCheck { + InductiveCheck(BranchInst *Br, ICmpInst::Predicate Pred, const SCEVAddRecExpr *LHS, const SCEV *RHS) : Br(Br), Pred(Pred), LHS(LHS), RHS(RHS) {} + BranchInst *Br; + ICmpInst::Predicate Pred; + const SCEVAddRecExpr *LHS; + const SCEV *RHS; +}; + +bool skipLoop(const Loop &L, ScalarEvolution &SE) { + if (!L.isInnermost()) + return true; + if (!L.getLoopLatch() || !L.getLoopPreheader()) + return true; + auto *ExitCount = SE.getBackedgeTakenCount(&L); + if (isa(ExitCount)) + return true; + // TODO: support switches. + for (auto *BB : L.getBlocks()) + if (!isa(BB->getTerminator())) + return true; + return false; +} + +bool collectCandidates(SmallVectorImpl &Checks, Loop &L, ScalarEvolution &SE, DominatorTree &DT) { + assert(Checks.empty() && "Supposed to be an empty vector!"); + if (!L.isInnermost()) + return false; + BasicBlock *Latch = L.getLoopLatch(); + assert(Latch && "Should have a latch!"); + + for (BasicBlock *BB : L.getBlocks()) { + if (!DT.dominates(BB, Latch)) + continue; + using namespace PatternMatch; + ICmpInst::Predicate Pred; + Value *LHS, *RHS; + if (!match(BB->getTerminator(), m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), m_BasicBlock(), m_BasicBlock()))) + continue; + // TODO: Support case with inductive RHS. + auto *LHSS = dyn_cast(SE.getSCEV(LHS)); + if (!LHSS || !LHSS->isAffine() || LHSS->getLoop() != &L) + continue; + auto *RHSS = SE.getSCEV(RHS); + if (!SE.isAvailableAtLoopEntry(RHSS, &L)) + continue; + const SCEV *Step = LHSS->getOperand(1); + const SCEV *Next = SE.getAddExpr(LHSS, Step); + auto *Br = cast(BB->getTerminator()); + if (!SE.isImpliedCond(Pred, LHSS, RHSS, Pred, Next, RHSS, Br)) + continue; + Checks.push_back(InductiveCheck(Br, Pred, LHSS, RHSS)); + } + return !Checks.empty(); +} + +static void cloneLoop(SmallVectorImpl &Checks, Loop &L, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache &AC, const TargetTransformInfo &TTI) { + SmallVector NewBlocks; + DenseMap BlockMap; + ValueToValueMapTy VMap; + + LoopBlocksRPO RPOT(&L); + RPOT.perform(&LI); + for (auto *BB : RPOT) { + auto *NewBB = CloneBasicBlock(BB, VMap, ".fallback", BB->getParent()); + BlockMap[BB] = NewBB; + NewBlocks.push_back(NewBB); + } + + // Add new inputs to exit blocks. + SmallVector ExitBlocks; + L.getExitBlocks(ExitBlocks); + for (auto *ExitBB : ExitBlocks) { + SmallVector Preds; + for (auto *Pred : predecessors(ExitBB)) + Preds.push_back(Pred); + for (auto *Pred : Preds) { + if (!L.contains(Pred)) + continue; + for (auto &Insn : *ExitBB) { + auto *PN = dyn_cast(&Insn); + if (!PN) break; + Value *V = PN->getIncomingValueForBlock(Pred); + if (VMap.count(V)) + V = VMap[V]; + PN->addIncoming(V, BlockMap[Pred]); + } + } + } + + // Fixup in-loop blocks. + for (auto *BB : NewBlocks) { + auto *BI = cast(BB->getTerminator()); + for (size_t I = 0; I < BI->getNumSuccessors(); I++) { + auto *Succ = BI->getSuccessor(I); + if (L.contains(Succ)) + // For loop blocks, bind the successors. + BI->setSuccessor(I, BlockMap[Succ]); + } + + for (auto &Insn : *BB) + for (size_t I = 0; I < Insn.getNumOperands(); I++) + if (VMap.count(Insn.getOperand(I))) + Insn.setOperand(I, VMap[Insn.getOperand(I)]); + } + + // Channel the preheader. + auto *Preheader = L.getLoopPreheader(); + Function *F = Preheader->getParent(); + auto &Context = Preheader->getContext(); + BasicBlock *NewPreheader = BasicBlock::Create(Context, "preheader.fallback", F); + IRBuilder<> Builder(NewPreheader); + BasicBlock *NewHeader = BlockMap[L.getHeader()]; + + for (auto &Check : Checks) { + auto *Br = Check.Br; + size_t ExitIndex = L.contains(Br->getSuccessor(0)) ? 1 : 0; + Br->setSuccessor(ExitIndex, NewPreheader); + } + for (auto &Insn : *L.getHeader()) { + auto *PN = dyn_cast(&Insn); + if (!PN) break; + auto *NewPN = cast(VMap[PN]); + for (size_t I = 0; I < NewPN->getNumIncomingValues(); I++) + if (NewPN->getIncomingBlock(I) == Preheader) { + auto *LCSSA = Builder.CreatePHI(PN->getType(), Checks.size(), PN->getName() + ".lcssa"); + for (auto &Check : Checks) { + auto *Br = Check.Br; + LCSSA->addIncoming(PN, Br->getParent()); + } + NewPN->setIncomingValue(I, LCSSA); + NewPN->setIncomingBlock(I, NewPreheader); + break; + } + } + + Builder.CreateBr(NewHeader); + DT.recalculate(*F); + assert(DT.verify() && "Bad dom tree!"); + assert(L.isLCSSAForm(DT) && "LCSSA form broken"); +} + +static bool removeChecks(Loop &L, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache &AC, const TargetTransformInfo &TTI) { + if (skipLoop(L, SE)) + return false; + SmallVector Checks; + if (!collectCandidates(Checks, L, SE, DT)) + return false; + cloneLoop(Checks, L, LI, SE, DT, AC, TTI); + + const size_t UnrollFactor = 4; + UnrollLoopOptions ULO; + ULO.Count = UnrollFactor; + ULO.TripCount = 0; + ULO.Force = true; + ULO.AllowRuntime = true; + ULO.AllowExpensiveTripCount = true; + ULO.PreserveCondBr = true; + ULO.PreserveOnlyFirst = false; + ULO.TripMultiple = 1; + ULO.PeelCount = 0; + ULO.UnrollRemainder = false; + ULO.ForgetAllSCEV = true; + +// UnrollLoop(&L, ULO, &LI, &SE, &DT, &AC, &TTI, /*OptimizationRemarkEmitter*/ nullptr, /*PreserveLCSSA*/ true); + + return false; +} + +PreservedAnalyses InductiveUnrollingPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &LU) { + if (removeChecks(L, AR.LI, AR.SE, AR.DT, AR.AC, AR.TTI)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} Index: llvm/test/Transforms/InductiveUnroll/motivation.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InductiveUnroll/motivation.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=inductive-unroll %s | FileCheck %s + +target datalayout = "n8:16:32:64" + +define i32 @test_01(i32 %end, i32 %len) { +; CHECK-LABEL: @test_01( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[RC:%.*]] = icmp slt i32 [[IV]], [[LEN:%.*]] +; CHECK-NEXT: br i1 [[RC]], label [[BACKEDGE]], label [[PREHEADER_FALLBACK:%.*]] +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[END_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[END:%.*]] +; CHECK-NEXT: br i1 [[END_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ], [ [[IV_NEXT_FALLBACK:%.*]], [[BACKEDGE_FALLBACK:%.*]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: failure: +; CHECK-NEXT: unreachable +; CHECK: loop.fallback: +; CHECK-NEXT: [[IV_FALLBACK:%.*]] = phi i32 [ [[IV_LCSSA:%.*]], [[PREHEADER_FALLBACK]] ], [ [[IV_NEXT_FALLBACK]], [[BACKEDGE]] ] +; CHECK-NEXT: [[RC_FALLBACK:%.*]] = icmp slt i32 [[IV_FALLBACK]], [[LEN]] +; CHECK-NEXT: br i1 [[RC_FALLBACK]], label [[BACKEDGE_FALLBACK]], label [[FAILURE:%.*]] +; CHECK: backedge.fallback: +; CHECK-NEXT: [[IV_NEXT_FALLBACK]] = add nsw i32 [[IV_FALLBACK]], 1 +; CHECK-NEXT: [[END_COND_FALLBACK:%.*]] = icmp slt i32 [[IV_NEXT_FALLBACK]], [[END]] +; CHECK-NEXT: br i1 [[END_COND_FALLBACK]], label [[LOOP_FALLBACK:%.*]], label [[DONE]] +; CHECK: preheader.fallback: +; CHECK-NEXT: [[IV_LCSSA]] = phi i32 [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: br label [[LOOP_FALLBACK]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i32 %iv, %len + br i1 %rc, label %backedge, label %failure + +backedge: ; preds = %loop + %iv.next = add nsw i32 %iv, 1 + %end.cond = icmp slt i32 %iv.next, %end + br i1 %end.cond, label %loop, label %done + +done: ; preds = %backedge + %iv.next.lcssa = phi i32 [ %iv.next, %backedge ] + ret i32 %iv.next.lcssa + +failure: ; preds = %loop + unreachable +} + +define i32 @test_02(i32 %end, i32 %len, i32* %p) { +; CHECK-LABEL: @test_02( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE]] ] +; CHECK-NEXT: [[RC:%.*]] = icmp slt i32 [[IV]], [[LEN:%.*]] +; CHECK-NEXT: br i1 [[RC]], label [[BACKEDGE]], label [[PREHEADER_FALLBACK:%.*]] +; CHECK: backedge: +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[X]] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[END_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[END:%.*]] +; CHECK-NEXT: br i1 [[END_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ], [ [[SUM_NEXT_FALLBACK:%.*]], [[BACKEDGE_FALLBACK:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; CHECK: failure: +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi i32 [ [[SUM]], [[LOOP]] ], [ [[SUM_FALLBACK:%.*]], [[LOOP_FALLBACK:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_LCSSA]] +; CHECK: loop.fallback: +; CHECK-NEXT: [[IV_FALLBACK:%.*]] = phi i32 [ [[IV_LCSSA:%.*]], [[PREHEADER_FALLBACK]] ], [ [[IV_NEXT_FALLBACK:%.*]], [[BACKEDGE]] ] +; CHECK-NEXT: [[SUM_FALLBACK]] = phi i32 [ [[SUM_LCSSA1:%.*]], [[PREHEADER_FALLBACK]] ], [ [[SUM_NEXT_FALLBACK]], [[BACKEDGE]] ] +; CHECK-NEXT: [[RC_FALLBACK:%.*]] = icmp slt i32 [[IV_FALLBACK]], [[LEN]] +; CHECK-NEXT: br i1 [[RC_FALLBACK]], label [[BACKEDGE_FALLBACK]], label [[FAILURE:%.*]] +; CHECK: backedge.fallback: +; CHECK-NEXT: [[ADDR_FALLBACK:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IV_FALLBACK]] +; CHECK-NEXT: [[X_FALLBACK:%.*]] = load i32, i32* [[ADDR_FALLBACK]], align 4 +; CHECK-NEXT: [[SUM_NEXT_FALLBACK]] = add i32 [[SUM_FALLBACK]], [[X_FALLBACK]] +; CHECK-NEXT: [[IV_NEXT_FALLBACK]] = add nsw i32 [[IV_FALLBACK]], 1 +; CHECK-NEXT: [[END_COND_FALLBACK:%.*]] = icmp slt i32 [[IV_NEXT_FALLBACK]], [[END]] +; CHECK-NEXT: br i1 [[END_COND_FALLBACK]], label [[LOOP_FALLBACK]], label [[DONE]] +; CHECK: preheader.fallback: +; CHECK-NEXT: [[IV_LCSSA]] = phi i32 [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: [[SUM_LCSSA1]] = phi i32 [ [[SUM]], [[LOOP]] ] +; CHECK-NEXT: br label [[LOOP_FALLBACK]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %sum = phi i32 [0, %entry], [%sum.next, %backedge] + %rc = icmp slt i32 %iv, %len + br i1 %rc, label %backedge, label %failure + +backedge: ; preds = %loop + %addr = getelementptr inbounds i32, i32* %p, i32 %iv + %x = load i32, i32* %addr + %sum.next = add i32 %sum, %x + %iv.next = add nsw i32 %iv, 1 + %end.cond = icmp slt i32 %iv.next, %end + br i1 %end.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [%sum.next, %backedge ] + ret i32 %sum.next.lcssa + +failure: ; preds = %loop + %sum.lcssa = phi i32 [%sum, %loop ] + ret i32 %sum.lcssa +}