diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -64,6 +64,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/IVUsers.h" @@ -186,6 +187,13 @@ "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost")); +static cl::opt AllowTerminatingConditionFoldingAfterLSR( + "lsr-term-fold", cl::Hidden, cl::init(false), + cl::desc("Attempt to replace primary IV with other IV.")); + +STATISTIC(NumTermFold, + "Number of terminating condition fold recognized and performed"); + #ifndef NDEBUG // Stress test IV chain generation. static cl::opt StressIVChain( @@ -6572,6 +6580,135 @@ return nullptr; } +static Optional> +canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, + const LoopInfo &LI) { + if (!L->isInnermost()) { + LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n"); + return None; + } + // Only inspect on simple loop structure + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n"); + return None; + } + + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n"); + return None; + } + + BasicBlock *LoopPreheader = L->getLoopPreheader(); + BasicBlock *LoopLatch = L->getLoopLatch(); + + // TODO: Can we do something for greater than and less than? + // Terminating condition is foldable when it is an eq/ne icmp + BranchInst *BI = cast(LoopLatch->getTerminator()); + if (BI->isUnconditional()) + return None; + Value *TermCond = BI->getCondition(); + if (!isa(TermCond) || !cast(TermCond)->isEquality()) { + LLVM_DEBUG(dbgs() << "Cannot fold on branching condition that is not an " + "ICmpInst::eq / ICmpInst::ne\n"); + return None; + } + + // For `IsToFold`, a primary IV can be replaced by other affine AddRec when it + // is only used by the terminating condition. To check for this, we may need + // to traverse through a chain of use-def until we can examine the final + // usage. + // *----------------------* + // *---->| LoopHeader: | + // | | PrimaryIV = phi ... | + // | *----------------------* + // | | + // | | + // | chain of + // | single use + // used by | + // phi | + // | Value + // | / \ + // | chain of chain of + // | single use single use + // | / \ + // | / \ + // *- Value Value --> used by terminating condition + auto IsToFold = [&](PHINode &PN) -> bool { + Value *V = &PN; + + while (V->getNumUses() == 1) + V = *V->user_begin(); + + if (V->getNumUses() != 2) + return false; + + Value *VToPN = nullptr; + Value *VToTermCond = nullptr; + for (User *U : V->users()) { + while (U->getNumUses() == 1) { + if (isa(U)) + VToPN = U; + if (U == TermCond) + VToTermCond = U; + U = *U->user_begin(); + } + } + return VToPN && VToTermCond; + }; + + // For `IsToHelpFold`, other IV that is an affine AddRec will be sufficient to + // replace the terminating condition + auto IsToHelpFold = [&](PHINode &PN) -> bool { + // TODO: Right now we limit the phi node to help the folding be of a start + // value of getelementptr. We can extend to any kinds of IV as long as it is + // an affine AddRec. Add a switch to cover more types of instructions here + // and down in the actual transformation. + return isa(PN.getIncomingValueForBlock(LoopPreheader)); + }; + + PHINode *ToFold = nullptr; + PHINode *ToHelpFold = nullptr; + + for (PHINode &PN : L->getHeader()->phis()) { + if (!SE.isSCEVable(PN.getType())) { + LLVM_DEBUG(dbgs() << "IV of phi '" << PN + << "' is not SCEV-able, not qualified for the " + "terminating condition folding.\n"); + continue; + } + const SCEV *S = SE.getSCEV(&PN); + const SCEVAddRecExpr *AddRec = dyn_cast(S); + // Only speculate on affine AddRec + if (!AddRec || !AddRec->isAffine()) { + LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN + << "' is not an affine add recursion, not qualified " + "for the terminating condition folding.\n"); + continue; + } + + if (IsToFold(PN)) + ToFold = &PN; + else if (IsToHelpFold(PN)) + ToHelpFold = &PN; + } + + LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs() + << "Cannot find other AddRec IV to help folding\n";); + + LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs() + << "\nFound loop that can fold terminating condition\n" + << " BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n" + << " TermCond: " << *TermCond << "\n" + << " BrandInst: " << *BI << "\n" + << " ToFold: " << *ToFold << "\n" + << " ToHelpFold: " << *ToHelpFold << "\n"); + + if (!ToFold || !ToHelpFold) + return None; + return {{ToFold, ToHelpFold}}; +} + static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, @@ -6630,6 +6767,93 @@ } } + if (AllowTerminatingConditionFoldingAfterLSR) { + auto CanFoldTerminatingCondition = canFoldTermCondOfLoop(L, SE, DT, LI); + if (CanFoldTerminatingCondition) { + BasicBlock *LoopPreheader = L->getLoopPreheader(); + BasicBlock *LoopLatch = L->getLoopLatch(); + + PHINode *ToFold = CanFoldTerminatingCondition->first; + PHINode *ToHelpFold = CanFoldTerminatingCondition->second; + + LLVM_DEBUG(dbgs() << "To fold phi-node:\n" + << *ToFold << "\n" + << "New term-cond phi-node:\n" + << *ToHelpFold << "\n"); + + Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader); + Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch); + + // SCEVExpander for both use in preheader and latch + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); + SCEVExpanderCleaner ExpCleaner(Expander); + + // Create new terminating value at loop header + GetElementPtrInst *StartValueGEP = cast(StartValue); + Type *PtrTy = StartValueGEP->getPointerOperand()->getType(); + + const SCEV *BECount = SE.getBackedgeTakenCount(L); + const SCEVAddRecExpr *AddRec = + cast(SE.getSCEV(ToHelpFold)); + + // TermValue = Start + Stride * (BackedgeCount + 1) + const SCEV *TermValueS = SE.getAddExpr( + AddRec->getOperand(0), + SE.getTruncateOrZeroExtend( + SE.getMulExpr( + AddRec->getOperand(1), + SE.getTruncateOrZeroExtend( + SE.getAddExpr(BECount, SE.getOne(BECount->getType())), + AddRec->getOperand(1)->getType())), + AddRec->getOperand(0)->getType())); + + // NOTE: If this is triggered, we should add this into predicate + if (!Expander.isSafeToExpand(TermValueS)) { + LLVMContext &Ctx = L->getHeader()->getContext(); + Ctx.emitError( + "Terminating value is not safe to expand, need to add it to " + "predicate"); + } else { // Now we replace the condition with ToHelpFold and remove ToFold + Changed = true; + NumTermFold++; + + Value *TermValue = Expander.expandCodeFor( + TermValueS, PtrTy, LoopPreheader->getTerminator()); + + LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n" + << *StartValue << "\n" + << "Terminating value of new term-cond phi-node:\n" + << *TermValue << "\n"); + + // Create new terminating condition at loop latch + BranchInst *BI = cast(LoopLatch->getTerminator()); + ICmpInst *OldTermCond = cast(BI->getCondition()); + IRBuilder<> LatchBuilder(LoopLatch->getTerminator()); + Value *NewTermCond = LatchBuilder.CreateICmp( + OldTermCond->getPredicate(), LoopValue, TermValue, + "lsr_fold_term_cond.replaced_term_cond"); + + LLVM_DEBUG(dbgs() << "Old term-cond:\n" + << *OldTermCond << "\n" + << "New term-cond:\b" << *NewTermCond << "\n"); + + BI->setCondition(NewTermCond); + + OldTermCond->replaceAllUsesWith( + PoisonValue::get(OldTermCond->getType())); + OldTermCond->eraseFromParent(); + + // Cleanup the old terminating condition that is no longer used + // Clear the PHINode and DCE will do the rest... + while (ToFold->getNumIncomingValues()) + ToFold->removeIncomingValue(0u); + } + + ExpCleaner.markResultUsed(); + } + } + if (SalvageableDVIRecords.empty()) return Changed; diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll @@ -0,0 +1,140 @@ +; REQUIRES: asserts +; RUN: opt < %s -passes="loop-reduce" -S -debug -lsr-term-fold 2>&1 | FileCheck %s + +target datalayout = "e-p:32:32:32-n32" + +define i32 @loop_variant(ptr %ar, i32 %n, i32 %m) { +; CHECK: Cannot fold on backedge that is loop variant +entry: + br label %for.cond + +for.cond: ; preds = %for.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %mul, %for.cond ] + %cmp = icmp slt i32 %n.addr.0, %m + %mul = shl nsw i32 %n.addr.0, 1 + br i1 %cmp, label %for.cond, label %for.end + +for.end: ; preds = %for.cond + ret i32 %n.addr.0 +} + +define i32 @nested_loop(ptr %ar, i32 %n, i32 %m, i32 %o) { +; CHECK: Cannot fold on backedge that is loop variant +; CHECK: Cannot fold on non-innermost loop +entry: + %cmp15 = icmp sgt i32 %o, 0 + br i1 %cmp15, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + %cnt.0.lcssa = phi i32 [ 0, %entry ], [ %cnt.1.lcssa, %for.cond.cleanup3 ] + ret i32 %cnt.0.lcssa + +for.body: ; preds = %entry, %for.cond.cleanup3 + %i.017 = phi i32 [ %inc6, %for.cond.cleanup3 ], [ 0, %entry ] + %cnt.016 = phi i32 [ %cnt.1.lcssa, %for.cond.cleanup3 ], [ 0, %entry ] + %sub = sub nsw i32 %n, %i.017 + %cmp212 = icmp slt i32 %sub, %m + br i1 %cmp212, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.body4, %for.body + %cnt.1.lcssa = phi i32 [ %cnt.016, %for.body ], [ %inc, %for.body4 ] + %inc6 = add nuw nsw i32 %i.017, 1 + %cmp = icmp slt i32 %inc6, %o + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body4: ; preds = %for.body, %for.body4 + %j.014 = phi i32 [ %mul, %for.body4 ], [ %sub, %for.body ] + %cnt.113 = phi i32 [ %inc, %for.body4 ], [ %cnt.016, %for.body ] + %inc = add nsw i32 %cnt.113, 1 + %mul = shl nsw i32 %j.014, 1 + %cmp2 = icmp slt i32 %mul, %m + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; The terminating condition folding transformation cannot find the ptr IV +; because it checks if the value comes from the LoopPreheader. %mark is from +; the function argument, so it is not qualified for the transformation. +define void @no_iv_to_help(ptr %mark, i32 signext %length) { +; CHECK: Cannot find other AddRec IV to help folding +entry: + %tobool.not3 = icmp eq i32 %length, 0 + br i1 %tobool.not3, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %dec, %for.body ], [ %length, %entry ] + %dst.04 = phi ptr [ %incdec.ptr, %for.body ], [ %mark, %entry ] + %0 = load ptr, ptr %dst.04, align 8 + call ptr @foo(ptr %0) + %incdec.ptr = getelementptr inbounds ptr, ptr %dst.04, i64 1 + %dec = add nsw i32 %i.05, -1 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %for.cond.cleanup, label %for.body +} + +declare void @foo(ptr) + +define void @NonAddRecIV(ptr %a) { +; CHECK: SCEV of phi ' %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 1, %entry ]' +; CHECK: is not an affine add recursion, not qualified for the terminating condition folding. +entry: + %uglygep = getelementptr i8, ptr %a, i32 84 + br label %for.body + +for.body: ; preds = %for.body, %entry + %lsr.iv1 = phi ptr [ %uglygep2, %for.body ], [ %uglygep, %entry ] + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 1, %entry ] + store i32 1, ptr %lsr.iv1, align 4 + %lsr.iv.next = mul nsw i32 %lsr.iv, 2 + %uglygep2 = getelementptr i8, ptr %lsr.iv1, i64 4 + %exitcond.not = icmp eq i32 %lsr.iv.next, 65536 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +@fp_inc = common global float 0.000000e+00, align 4 + +define void @NonSCEVableIV(float %init, float* %A, i32 %N) { +; CHECK: IV of phi ' %x.05 = phi float [ %init, %entry ], [ %add, %for.body ]' +; CHECK: is not SCEV-able, not qualified for the terminating condition folding. +entry: + %0 = load float, float* @fp_inc, align 4 + br label %for.body + +for.body: ; preds = %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %x.05 = phi float [ %init, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.05, float* %arrayidx, align 4 + %add = fsub float %x.05, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.end + ret void +} + +define void @NonIcmpEqNe(ptr %a) { +; CHECK: Cannot fold on branching condition that is not an ICmpInst::eq / ICmpInst::ne +entry: + %uglygep = getelementptr i8, ptr %a, i64 84 + br label %for.body + +for.body: ; preds = %for.body, %entry + %lsr.iv1 = phi ptr [ %uglygep2, %for.body ], [ %uglygep, %entry ] + %lsr.iv = phi i64 [ %lsr.iv.next, %for.body ], [ 379, %entry ] + store i32 1, ptr %lsr.iv1, align 4 + %lsr.iv.next = add nsw i64 %lsr.iv, -1 + %uglygep2 = getelementptr i8, ptr %lsr.iv1, i64 4 + %exitcond.not = icmp sle i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll --- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes="loop-reduce" -S | FileCheck %s +; RUN: opt < %s -passes="loop-reduce" -S -lsr-term-fold | FileCheck %s ; There are 3 test cases here regarding the replacing the primary iv with ; other affine AddRec IV. @@ -20,15 +20,15 @@ ; CHECK-LABEL: @const_tripcount( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i32 1600 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 379, [[ENTRY]] ] ; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4 -; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1 +; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nsw i64 poison, -1 ; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -59,15 +59,17 @@ ; CHECK-LABEL: @runtime_tripcount( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 84 +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N:%.*]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 84 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP1]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N:%.*]], [[ENTRY]] ] ; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4 -; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nsw i32 poison, -1 ; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -100,17 +102,18 @@ define void @ptr_of_ptr_addrec(ptr %ptrptr, i32 %length) { ; CHECK-LABEL: @ptr_of_ptr_addrec( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[START_PTRPTR:%.*]] = getelementptr inbounds ptr, ptr [[PTRPTR:%.*]] +; CHECK-NEXT: [[START_PTRPTR:%.*]] = getelementptr ptr, ptr [[PTRPTR:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[LENGTH:%.*]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[START_PTRPTR]], i32 [[TMP0]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ [[LENGTH:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IT_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START_PTRPTR]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IT_04]], align 8 -; CHECK-NEXT: tail call void @foo(ptr [[TMP0]]) +; CHECK-NEXT: [[IT_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START_PTRPTR]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IT_04]], align 8 +; CHECK-NEXT: tail call void @foo(ptr [[TMP1]]) ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds ptr, ptr [[IT_04]], i64 1 -; CHECK-NEXT: [[DEC]] = add i32 [[I_05]], -1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[DEC:%.*]] = add i32 poison, -1 +; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[UGLYGEP]] +; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ;