Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -140,6 +140,13 @@
     cl::desc("Narrow LSR complex solution using"
              " expectation of registers number"));
 
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+    "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+    cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+             " with the same ScaledReg and Scale"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1902,6 +1909,7 @@
   void NarrowSearchSpaceByDetectingSupersets();
   void NarrowSearchSpaceByCollapsingUnrolledCode();
   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
   void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
@@ -4306,6 +4314,104 @@
   }
 }
 
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic is to keep as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that it is more likely to find a better solution
+/// from a formulae set with more Scale and ScaledReg variations than
+/// from a formulae set with the same Scale and ScaledReg. The
+/// picking-winner-regs heuristic will often keep the formulae with the
+/// same Scale and ScaledReg and filter out the others, and we want to
+/// avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n"
+                  "Narrowing the search space by choosing the best Formula "
+                  "from the Formulae with the same Scale and ScaledReg.\n");
+
+  // Map the "Scale * ScaledReg" pair to the best formula of the current
+  // LSRUse.
+  typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+  BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+  bool ChangedFormulae = false;
+#endif
+  DenseSet<const SCEV *> VisitedRegs;
+  SmallPtrSet<const SCEV *, 16> Regs;
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+    // Return true if Formula FA is better than Formula FB.
+    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+      // First we will try to choose the Formula with fewer new registers.
+      // For a register used by the current Formula, the more the register
+      // is shared among LSRUses, the less we increase the register number
+      // counter of the formula.
+      size_t FARegNum = 0;
+      for (const SCEV *Reg : FA.BaseRegs) {
+        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+        FARegNum += (NumUses - UsedByIndices.count() + 1);
+      }
+      size_t FBRegNum = 0;
+      for (const SCEV *Reg : FB.BaseRegs) {
+        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+        FBRegNum += (NumUses - UsedByIndices.count() + 1);
+      }
+      if (FARegNum != FBRegNum)
+        return FARegNum < FBRegNum;
+
+      // If the new register numbers are the same, choose the Formula with
+      // the lower Cost.
+      Cost CostFA, CostFB;
+      Regs.clear();
+      CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+      Regs.clear();
+      CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+      return CostFA.isLess(CostFB, TTI);
+    };
+
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+         ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+      if (!F.ScaledReg)
+        continue;
+      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+      if (P.second)
+        continue;
+
+      Formula &Best = LU.Formulae[P.first->second];
+      if (IsBetterThan(F, Best))
+        std::swap(F, Best);
+      DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
+            dbgs() << "\n"
+                      "    in favor of formula ";
+            Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+      ChangedFormulae = true;
+#endif
+      LU.DeleteFormula(F);
+      --FIdx;
+      --NumForms;
+      Any = true;
+    }
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+
+    // Reset this to prepare for the next use.
+    BestFormulae.clear();
+  }
+
+  DEBUG(if (ChangedFormulae) {
+    dbgs() << "\n"
+              "After filtering out undesirable candidates:\n";
+    print_uses(dbgs());
+  });
+}
+
 /// The function delete formulas with high registers number expectation.
 /// Assuming we don't know the value of each formula (already delete
 /// all inefficient), generate probability of not selecting for each
@@ -4516,6 +4622,8 @@
   NarrowSearchSpaceByDetectingSupersets();
   NarrowSearchSpaceByCollapsingUnrolledCode();
   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  if (FilterSameScaledReg)
+    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
   if (LSRExpNarrow)
     NarrowSearchSpaceByDeletingCostlyFormulas();
   else
Index: test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
===================================================================
--- test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
 ; Test case for the recoloring of broken hints.
 ; This is tricky to have something reasonably small to kick this optimization since
 ; it requires that spliting and spilling occur.
Index: test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -14,8 +14,8 @@
 ; current LSR cost model.
 ; CHECK-NOT: = ptrtoint i8* undef to i64
 ; CHECK: .lr.ph
-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
-; CHECK: sub i64 [[TMP]], %tmp6
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1
+; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}}
 ; CHECK: ret void
 define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
 bb:
Index: test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
@@ -0,0 +1,163 @@
+; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%class.A = type { i8, i8, [5 x i32], i64, i64, i64 }
+%class.ZippyScatteredWriter = type { i8, i8*, i8* }
+@e = local_unnamed_addr global %class.A { i8 0, i8 0, [5 x i32] zeroinitializer, i64 1, i64 0, i64 1 }, align 8
+@f = local_unnamed_addr global %class.ZippyScatteredWriter* null, align 8
+declare void @_ZN20ZippyScatteredWriter5m_fn2Ev(%class.ZippyScatteredWriter*) local_unnamed_addr
+
+define void @foo() local_unnamed_addr {
+entry:
+  %tmp = load %class.ZippyScatteredWriter*, %class.ZippyScatteredWriter** @f, align 8
+  %op_ptr_.i.i = getelementptr inbounds %class.ZippyScatteredWriter, %class.ZippyScatteredWriter* %tmp, i64 0, i32 1
+  %sub.ptr.rhs.cast.i.i = ptrtoint %class.ZippyScatteredWriter* %tmp to i64
+  %op_limit_.i.i = getelementptr inbounds %class.ZippyScatteredWriter, %class.ZippyScatteredWriter* %tmp, i64 0, i32 2
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i, %entry
+  %tmp1 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 3), align 8
+  %and.i = and i64 %tmp1, 1792
+  %tmp2 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 4), align 8
+  %add.i = add i64 %and.i, %tmp2
+  %tmp3 = load i64, i64* getelementptr inbounds (%class.A, %class.A* @e, i64 0, i32 5), align 8
+  %tmp4 = load i8*, i8** %op_ptr_.i.i, align 8
+  %add.ptr.i.i = getelementptr inbounds i8, i8* %tmp4, i64 %tmp3
+  %sub.i.i = add i64 %add.i, -1
+  %sub.ptr.lhs.cast.i.i = ptrtoint i8* %tmp4 to i64
+  %sub.ptr.sub.i.i = sub i64 %sub.ptr.lhs.cast.i.i, %sub.ptr.rhs.cast.i.i
+  %cmp.i.i = icmp ult i64 %sub.i.i, %sub.ptr.sub.i.i
+  br i1 %cmp.i.i, label %land.rhs.i.i, label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i
+
+land.rhs.i.i:                                     ; preds = %for.cond.i
+  %tmp5 = load i8*, i8** %op_limit_.i.i, align 8
+  %cmp3.i.i = icmp ugt i8* %add.ptr.i.i, %tmp5
+  br i1 %cmp3.i.i, label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i, label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %land.rhs.i.i
+  %idx.neg.i.i = sub i64 0, %add.i
+  %add.ptr5.i.i = getelementptr inbounds i8, i8* %tmp4, i64 %idx.neg.i.i
+  %sub.ptr.rhs.cast.i.i.i = ptrtoint i8* %add.ptr5.i.i to i64
+  %tmp6 = sub i64 0, %sub.ptr.rhs.cast.i.i.i
+  %add.ptr.i.i.i = inttoptr i64 %tmp6 to i8*
+  %add.ptr1.i.i.i = getelementptr inbounds i8, i8* %tmp5, i64 -6
+  br label %while.cond.i.i.i
+
+; Without filtering out non-optimal formulae with the same ScaledReg and Scale, the
+; strategy of narrowing the LSR search space by picking winner regs will generate one
+; extra lsr.iv and a non-optimal result.
+; CHECK-LABEL: @foo(
+; CHECK: while.cond.i.i.i:
+; CHECK-NOT: %lsr.iv = phi i64
+; CHECK-NEXT: %p1.addr.0.i.i.i = phi i8*
+; CHECK-NEXT: %b.0.i.i.i = phi i8*
+
+while.cond.i.i.i:                                 ; preds = %while.body.i.i.i, %if.then.i.i
+  %p1.addr.0.i.i.i = phi i8* [ %add.ptr5.i.i, %if.then.i.i ], [ %add.ptr4.i.i.i, %while.body.i.i.i ]
+  %b.0.i.i.i = phi i8* [ %add.ptr.i.i.i, %if.then.i.i ], [ %add.ptr5.i.i.i, %while.body.i.i.i ]
+  %cmp.i.i.i = icmp ugt i8* %b.0.i.i.i, %add.ptr1.i.i.i
+  br i1 %cmp.i.i.i, label %while.end.i.i.i, label %while.body.i.i.i
+
+while.body.i.i.i:                                 ; preds = %while.cond.i.i.i
+  %a.sroa.0.0..sroa_cast2.i.i.i.i = bitcast i8* %p1.addr.0.i.i.i to i64*
+  %a.sroa.0.0.copyload.i.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i.i.i.i = bitcast i8* %b.0.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i.i.i.i, align 1
+  %add.ptr2.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 8
+  %add.ptr3.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 8
+  %a.sroa.0.0..sroa_cast2.i39.i.i.i = bitcast i8* %add.ptr2.i.i.i to i64*
+  %a.sroa.0.0.copyload.i40.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i39.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i41.i.i.i = bitcast i8* %add.ptr3.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i40.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i41.i.i.i, align 1
+  %add.ptr4.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 16
+  %add.ptr5.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 16
+  %cmp6.i.i.i = icmp ult i8* %add.ptr5.i.i.i, %add.ptr.i.i
+  br i1 %cmp6.i.i.i, label %while.cond.i.i.i, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30
+
+while.end.i.i.i:                                  ; preds = %while.cond.i.i.i
+  %add.ptr8.i.i.i = getelementptr inbounds i8, i8* %tmp5, i64 -8
+  %cmp9.i.i.i = icmp ugt i8* %b.0.i.i.i, %add.ptr8.i.i.i
+  br i1 %cmp9.i.i.i, label %if.end16.i.i.i, label %if.then13.i.i.i
+
+if.then13.i.i.i:                                  ; preds = %while.end.i.i.i
+  %a.sroa.0.0..sroa_cast2.i42.i.i.i = bitcast i8* %p1.addr.0.i.i.i to i64*
+  %a.sroa.0.0.copyload.i43.i.i.i = load i64, i64* %a.sroa.0.0..sroa_cast2.i42.i.i.i, align 1
+  %a.sroa.0.0..sroa_cast3.i44.i.i.i = bitcast i8* %b.0.i.i.i to i64*
+  store i64 %a.sroa.0.0.copyload.i43.i.i.i, i64* %a.sroa.0.0..sroa_cast3.i44.i.i.i, align 1
+  %add.ptr14.i.i.i = getelementptr inbounds i8, i8* %p1.addr.0.i.i.i, i64 8
+  %add.ptr15.i.i.i = getelementptr inbounds i8, i8* %b.0.i.i.i, i64 8
+  br label %if.end16.i.i.i
+
+if.end16.i.i.i:                                   ; preds = %if.then13.i.i.i, %while.end.i.i.i
+  %p1.addr.1.i.i.i = phi i8* [ %add.ptr14.i.i.i, %if.then13.i.i.i ], [ %p1.addr.0.i.i.i, %while.end.i.i.i ]
+  %b.1.i.i.i = phi i8* [ %add.ptr15.i.i.i, %if.then13.i.i.i ], [ %b.0.i.i.i, %while.end.i.i.i ]
+  %b.1.i.i.i13 = ptrtoint i8* %b.1.i.i.i to i64
+  %cmp4.i.i.i.i = icmp ult i8* %b.1.i.i.i, %add.ptr.i.i
+  br i1 %cmp4.i.i.i.i, label %while.body.i.i.i.i.preheader, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+while.body.i.i.i.i.preheader:                     ; preds = %if.end16.i.i.i
+  %tmp7 = sub i64 %tmp3, %b.1.i.i.i13
+  %scevgep = getelementptr i8, i8* %tmp4, i64 %tmp7
+  %exitcount.ptrcnt.to.int = ptrtoint i8* %scevgep to i64
+  %min.iters.check = icmp ult i8* %scevgep, inttoptr (i64 32 to i8*)
+  br i1 %min.iters.check, label %while.body.i.i.i.i.preheader29, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %while.body.i.i.i.i.preheader
+  %n.vec = and i64 %exitcount.ptrcnt.to.int, -32
+  %cmp.zero = icmp eq i64 %n.vec, 0
+  br i1 %cmp.zero, label %while.body.i.i.i.i.preheader29, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %tmp8 = sub i64 %tmp3, %b.1.i.i.i13
+  %scevgep14 = getelementptr i8, i8* %tmp4, i64 %tmp8
+  %scevgep1415 = ptrtoint i8* %scevgep14 to i64
+  %scevgep16 = getelementptr i8, i8* %p1.addr.1.i.i.i, i64 %scevgep1415
+  %bound0 = icmp ult i8* %b.1.i.i.i, %scevgep16
+  %bound1 = icmp ult i8* %p1.addr.1.i.i.i, %add.ptr.i.i
+  %memcheck.conflict = and i1 %bound0, %bound1
+  %ind.end = getelementptr i8, i8* %p1.addr.1.i.i.i, i64 %n.vec
+  %ind.end18 = getelementptr i8, i8* %b.1.i.i.i, i64 %n.vec
+  br i1 %memcheck.conflict, label %while.body.i.i.i.i.preheader29, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %tmp9 = add i64 %n.vec, -32
+  %tmp10 = lshr exact i64 %tmp9, 5
+  %tmp11 = add nuw nsw i64 %tmp10, 1
+  %xtraiter = and i64 %tmp11, 3
+  %tmp12 = icmp ult i64 %tmp9, 96
+  br i1 %tmp12, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, label %vector.body.preheader.new
+
+vector.body.preheader.new:                        ; preds = %vector.body.preheader
+  %cmp.n = icmp eq i64 %n.vec, %exitcount.ptrcnt.to.int
+  br i1 %cmp.n, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i, label %while.body.i.i.i.i.preheader29
+
+while.body.i.i.i.i.preheader29:                   ; preds = %vector.body.preheader.new, %vector.memcheck, %min.iters.checked, %while.body.i.i.i.i.preheader
+  %p1.addr.06.i.i.i.i.ph = phi i8* [ %ind.end, %vector.body.preheader.new ], [ %p1.addr.1.i.i.i, %while.body.i.i.i.i.preheader ], [ %p1.addr.1.i.i.i, %min.iters.checked ], [ %p1.addr.1.i.i.i, %vector.memcheck ]
+  %p2.addr.05.i.i.i.i.ph = phi i8* [ %ind.end18, %vector.body.preheader.new ], [ %b.1.i.i.i, %while.body.i.i.i.i.preheader ], [ %b.1.i.i.i, %min.iters.checked ], [ %b.1.i.i.i, %vector.memcheck ]
+  br label %while.body.i.i.i.i
+
+while.body.i.i.i.i:                               ; preds = %while.body.i.i.i.i, %while.body.i.i.i.i.preheader29
+  %p1.addr.06.i.i.i.i = phi i8* [ %incdec.ptr.i.i.i.i, %while.body.i.i.i.i ], [ %p1.addr.06.i.i.i.i.ph, %while.body.i.i.i.i.preheader29 ]
+  %p2.addr.05.i.i.i.i = phi i8* [ %incdec.ptr1.i.i.i.i, %while.body.i.i.i.i ], [ %p2.addr.05.i.i.i.i.ph, %while.body.i.i.i.i.preheader29 ]
+  %incdec.ptr.i.i.i.i = getelementptr inbounds i8, i8* %p1.addr.06.i.i.i.i, i64 1
+  %t43 = load i8, i8* %p1.addr.06.i.i.i.i, align 1
+  %incdec.ptr1.i.i.i.i = getelementptr inbounds i8, i8* %p2.addr.05.i.i.i.i, i64 1
+  store i8 %t43, i8* %p2.addr.05.i.i.i.i, align 1
+  %exitcond.i.i.i.i = icmp eq i8* %incdec.ptr1.i.i.i.i, %add.ptr.i.i
+  br i1 %exitcond.i.i.i.i, label %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, label %while.body.i.i.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit:    ; preds = %while.body.i.i.i.i, %vector.body.preheader
+  br label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30:  ; preds = %while.body.i.i.i
+  br label %_Z15IncrementalCopyPKcPcS1_.exit.i.i
+
+_Z15IncrementalCopyPKcPcS1_.exit.i.i:             ; preds = %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit30, %_Z15IncrementalCopyPKcPcS1_.exit.i.i.loopexit, %vector.body.preheader.new, %if.end16.i.i.i
+  store i8* %add.ptr.i.i, i8** %op_ptr_.i.i, align 8
+  br label %_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i
+
+_ZN20ZippyScatteredWriter5m_fn3Emm.exit.i:        ; preds = %_Z15IncrementalCopyPKcPcS1_.exit.i.i, %land.rhs.i.i, %for.cond.i
+  tail call void @_ZN20ZippyScatteredWriter5m_fn2Ev(%class.ZippyScatteredWriter* nonnull %tmp)
+  br label %for.cond.i
+}
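
For readers outside the LLVM tree, here is a minimal standalone C++ sketch of the
keep-the-best-formula-per-(ScaledReg, Scale) idiom that the new
NarrowSearchSpaceByFilterFormulaWithSameScaledReg implements. It is an illustration,
not the patch's code: the Formula struct, its plain-int ScaledReg, and the single
scalar Cost field are simplified stand-ins for the real LSR types (SCEV pointers,
and the two-step register-count-then-RateFormula comparison in IsBetterThan). The
deletion step mirrors LU.DeleteFormula's swap-to-back-and-pop behavior.

// Standalone sketch of the per-(ScaledReg, Scale) filtering loop above.
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct Formula {
  int ScaledReg;  // stand-in for `const SCEV *ScaledReg`
  int64_t Scale;
  unsigned Cost;  // stand-in for the RateFormula/isLess comparison
};

// Keep only the cheapest formula for each (ScaledReg, Scale) key,
// mirroring the insert-or-compare-and-swap loop in the patch.
static void filterSameScaledReg(std::vector<Formula> &Formulae) {
  std::map<std::pair<int, int64_t>, size_t> Best;
  for (size_t FIdx = 0, NumForms = Formulae.size(); FIdx != NumForms;
       ++FIdx) {
    Formula &F = Formulae[FIdx];
    auto P = Best.insert({{F.ScaledReg, F.Scale}, FIdx});
    if (P.second)
      continue;  // first formula seen for this key; keep it for now
    Formula &BestF = Formulae[P.first->second];
    if (F.Cost < BestF.Cost)
      std::swap(F, BestF);  // F is better: move it into the surviving slot
    // Delete the loser the way LU.DeleteFormula does: swap it to the back,
    // pop it, and revisit the element that now sits at FIdx.
    std::swap(F, Formulae.back());
    Formulae.pop_back();
    --FIdx;
    --NumForms;
  }
}

int main() {
  std::vector<Formula> Formulae = {
      {1, 4, 10}, {1, 4, 7}, {2, 8, 3}, {1, 4, 9}};
  filterSameScaledReg(Formulae);
  // Expect one survivor per key: {1, 4, 7} and {2, 8, 3}.
  for (const Formula &F : Formulae)
    std::printf("ScaledReg %d Scale %lld Cost %u\n", F.ScaledReg,
                (long long)F.Scale, F.Cost);
  return 0;
}

To observe the heuristic's effect on the new test case, run its RUN line once with
-lsr-filter-same-scaled-reg=true and once with =false: per the comment in the test,
with filtering disabled the picking-winner-regs fallback keeps same-Scale/ScaledReg
formulae, and the extra %lsr.iv phi that the CHECK-NOT line guards against appears
in while.cond.i.i.i.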