diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -140,6 +140,13 @@ cl::desc("Narrow LSR complex solution using" " expectation of registers number")); +// Flag to narrow search space by filtering non-optimal formulae with +// the same ScaledReg and Scale. +static cl::opt FilterSameScaledReg( + "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), + cl::desc("Narrow LSR search space by filtering non-optimal formulae" + " with the same ScaledReg and Scale")); + #ifndef NDEBUG // Stress test IV chain generation. static cl::opt StressIVChain( @@ -1902,6 +1909,7 @@ void NarrowSearchSpaceByDetectingSupersets(); void NarrowSearchSpaceByCollapsingUnrolledCode(); void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + void NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); void NarrowSearchSpaceByDeletingCostlyFormulas(); void NarrowSearchSpaceByPickingWinnerRegs(); void NarrowSearchSpaceUsingHeuristics(); @@ -4306,6 +4314,104 @@ } } +/// If a LSRUse has multiple formulae with the same ScaledReg and Scale. +/// Pick the best one and delete the others. +/// This narrowing heuristic is to keep as many formulae with different +/// Scale and ScaledReg pair as possible while narrowing the search space. +/// The benefit is that it is more likely to find out a better solution +/// from a formulae set with more Scale and ScaledReg variations than +/// a formulae set with the same Scale and ScaledReg. The picking winner +/// reg heurstic will often keep the formulae with the same Scale and +/// ScaledReg and filter others, and we want to avoid that if possible. +void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { + if (EstimateSearchSpaceComplexity() < ComplexityLimit) + return; + + DEBUG(dbgs() << "The search space is too complex.\n" + "Narrowing the search space by choosing the best Formula " + "from the Formulae with the same Scale and ScaledReg.\n"); + + // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse. + typedef DenseMap, size_t> BestFormulaeTy; + BestFormulaeTy BestFormulae; +#ifndef NDEBUG + bool ChangedFormulae = false; +#endif + DenseSet VisitedRegs; + SmallPtrSet Regs; + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); + + // Return true if Formula FA is better than Formula FB. + auto IsBetterThan = [&](Formula &FA, Formula &FB) { + // First we will try to choose the Formula with fewer new registers. + // For a register used by current Formula, the more the register is + // shared among LSRUses, the less we increase the register number + // counter of the formula. + size_t FARegNum = 0; + for (const SCEV *Reg : FA.BaseRegs) { + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg); + FARegNum += (NumUses - UsedByIndices.count() + 1); + } + size_t FBRegNum = 0; + for (const SCEV *Reg : FB.BaseRegs) { + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg); + FBRegNum += (NumUses - UsedByIndices.count() + 1); + } + if (FARegNum != FBRegNum) + return FARegNum < FBRegNum; + + // If the new register numbers are the same, choose the Formula with + // less Cost. + Cost CostFA, CostFB; + Regs.clear(); + CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU); + Regs.clear(); + CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU); + return CostFA.isLess(CostFB, TTI); + }; + + bool Any = false; + for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms; + ++FIdx) { + Formula &F = LU.Formulae[FIdx]; + if (!F.ScaledReg) + continue; + auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx}); + if (P.second) + continue; + + Formula &Best = LU.Formulae[P.first->second]; + if (IsBetterThan(F, Best)) + std::swap(F, Best); + DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; + Best.print(dbgs()); dbgs() << '\n'); +#ifndef NDEBUG + ChangedFormulae = true; +#endif + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; + } + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + + // Reset this to prepare for the next use. + BestFormulae.clear(); + } + + DEBUG(if (ChangedFormulae) { + dbgs() << "\n" + "After filtering out undesirable candidates:\n"; + print_uses(dbgs()); + }); +} + /// The function delete formulas with high registers number expectation. /// Assuming we don't know the value of each formula (already delete /// all inefficient), generate probability of not selecting for each @@ -4516,6 +4622,8 @@ NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + if (FilterSameScaledReg) + NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); if (LSRExpNarrow) NarrowSearchSpaceByDeletingCostlyFormulas(); else diff --git a/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll --- a/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll +++ b/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s +; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s ; Test case for the recoloring of broken hints. ; This is tricky to have something reasonably small to kick this optimization since ; it requires that spliting and spilling occur. diff --git a/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll --- a/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll @@ -14,8 +14,8 @@ ; current LSR cost model. ; CHECK-NOT: = ptrtoint i8* undef to i64 ; CHECK: .lr.ph -; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1 -; CHECK: sub i64 [[TMP]], %tmp6 +; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1 +; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}} ; CHECK: ret void define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 { bb: diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ham = type { i8, i8, [5 x i32], i64, i64, i64 } + +@global = external local_unnamed_addr global %struct.ham, align 8 + +define void @foo() local_unnamed_addr { +bb: + %tmp = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 3), align 8 + %tmp1 = and i64 %tmp, 1792 + %tmp2 = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 4), align 8 + %tmp3 = add i64 %tmp1, %tmp2 + %tmp4 = load i8*, i8** null, align 8 + %tmp5 = getelementptr inbounds i8, i8* %tmp4, i64 0 + %tmp6 = sub i64 0, %tmp3 + %tmp7 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp6 + %tmp8 = inttoptr i64 0 to i8* + br label %bb9 + +; Without filtering non-optimal formulae with the same ScaledReg and Scale, the strategy +; to narrow LSR search space by picking winner reg will generate only one lsr.iv and +; unoptimal result. +; CHECK-LABEL: @foo( +; CHECK: bb9: +; CHECK-NEXT: = phi i8* +; CHECK-NEXT: = phi i8* + +bb9: ; preds = %bb12, %bb + %tmp10 = phi i8* [ %tmp7, %bb ], [ %tmp16, %bb12 ] + %tmp11 = phi i8* [ %tmp8, %bb ], [ %tmp17, %bb12 ] + br i1 false, label %bb18, label %bb12 + +bb12: ; preds = %bb9 + %tmp13 = getelementptr inbounds i8, i8* %tmp10, i64 8 + %tmp14 = bitcast i8* %tmp13 to i64* + %tmp15 = load i64, i64* %tmp14, align 1 + %tmp16 = getelementptr inbounds i8, i8* %tmp10, i64 16 + %tmp17 = getelementptr inbounds i8, i8* %tmp11, i64 16 + br label %bb9 + +bb18: ; preds = %bb9 + %tmp19 = icmp ugt i8* %tmp11, null + %tmp20 = getelementptr inbounds i8, i8* %tmp10, i64 8 + %tmp21 = getelementptr inbounds i8, i8* %tmp11, i64 8 + %tmp22 = select i1 %tmp19, i8* %tmp10, i8* %tmp20 + %tmp23 = select i1 %tmp19, i8* %tmp11, i8* %tmp21 + br label %bb24 + +bb24: ; preds = %bb24, %bb18 + %tmp25 = phi i8* [ %tmp27, %bb24 ], [ %tmp22, %bb18 ] + %tmp26 = phi i8* [ %tmp29, %bb24 ], [ %tmp23, %bb18 ] + %tmp27 = getelementptr inbounds i8, i8* %tmp25, i64 1 + %tmp28 = load i8, i8* %tmp25, align 1 + %tmp29 = getelementptr inbounds i8, i8* %tmp26, i64 1 + store i8 %tmp28, i8* %tmp26, align 1 + %tmp30 = icmp eq i8* %tmp29, %tmp5 + br label %bb24 +}