Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1022,6 +1022,9 @@
   int64_t MinOffset;
   int64_t MaxOffset;
 
+  /// Number of uses that share the same base address.
+  unsigned SameBaseAddress;
+
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop;
@@ -1049,7 +1052,7 @@
   LSRUse(KindType K, MemAccessTy AT)
       : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
-        AllFixupsOutsideLoop(true), RigidFormula(false),
+        SameBaseAddress(0), AllFixupsOutsideLoop(true), RigidFormula(false),
         WidestFixupType(nullptr) {}
 
   LSRFixup &getNewFixup() {
@@ -1183,9 +1186,11 @@
   ScaleCost += getScalingFactorCost(TTI, LU, F);
 
   // Tally up the non-zero immediates.
+  int64_t AvgOffset = 0;
   for (const LSRFixup &Fixup : LU.Fixups) {
     int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
+    AvgOffset += O;
     if (F.BaseGV)
       ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
@@ -1198,6 +1203,17 @@
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
+  if (LU.Kind == LSRUse::Address && LU.SameBaseAddress) {
+    if (LU.Fixups.size())
+      AvgOffset /= LU.Fixups.size();
+    int64_t Offset = AvgOffset + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64 * LU.SameBaseAddress;
+    else if (F.BaseOffset != 0)
+      ImmCost +=
+          APInt(64, Offset, true).getMinSignedBits() * LU.SameBaseAddress;
+  }
   assert(isValid() && "invalid cost");
 }
@@ -1708,7 +1724,7 @@
   SmallPtrSet<Use*, MaxChains> IVIncSet;
 
   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+  bool FindIVUserForInst(Instruction *Cond, IVStrideUse *&CondUse);
   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
   void OptimizeLoopTermCond();
@@ -1743,6 +1759,8 @@
   void CollectLoopInvariantFixupsAndFormulae();
 
+  void CountSameBaseUses();
+
   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                               unsigned Depth = 0);
@@ -1932,7 +1950,7 @@
 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForInst(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2148,7 +2166,7 @@
     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
-    if (!FindIVUserForCond(Cond, CondUse))
+    if (!FindIVUserForInst(Cond, CondUse))
       continue;
 
     // If the trip count is computed in terms of a max (due to ScalarEvolution
@@ -3206,6 +3224,28 @@
   }
 }
 
+/// Set LU.SameBaseAddress to the number of Incs for each profitable Chain.
+void
+LSRInstance::CountSameBaseUses() {
+  for (const IVChain &Chain : IVChainVec) {
+    // Get an LSRUse.
+    if (Chain.Incs.size() < 2)
+      continue;
+    IVStrideUse *IUse;
+    if (!FindIVUserForInst(Chain.Incs[0].UserInst, IUse))
+      continue;
+    const SCEV *S = IU.getExpr(*IUse);
+    ExtractImmediate(S, SE);
+    LSRUse::SCEVUseKindPair P = LSRUse::SCEVUseKindPair(S, LSRUse::Address);
+    if (UseMap.count(P)) {
+      // A use already existed with this base.
+      size_t LUIdx = UseMap.find(P)->second;
+      LSRUse &LU = Uses[LUIdx];
+      LU.SameBaseAddress += (Chain.Incs.size() - 1);
+    }
+  }
+}
+
 /// Split S into subexpressions which can be pulled out into separate
 /// registers. If C is non-null, multiply each subexpression by C.
 ///
@@ -4883,6 +4923,9 @@
   CollectFixupsAndInitialFormulae();
   CollectLoopInvariantFixupsAndFormulae();
 
+  // Count address uses that share the same base address.
+  CountSameBaseUses();
+
   assert(!Uses.empty() && "IVUsers reported at least one use");
   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
         print_uses(dbgs()));
Index: test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
+++ test/Transforms/LoopStrengthReduce/X86/imm-cost.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s
+; This test checks that the immediate cost is counted for both loads (%tmp and %tmp1).
+; If it is counted only for the first load, LSR will prefer to leave the
+; setup add in place.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @foo(i32 %n, i8* nocapture readonly %a) {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %add.ptr = getelementptr inbounds i8, i8* %a, i64 -1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %inptr.014 = phi i8* [ %incdec.ptr1, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %s.013 = phi i32 [ %add3, %for.body ], [ 0, %for.body.preheader ]
+  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %inptr.014, i64 1
+  %tmp = load i8, i8* %inptr.014, align 1
+  %conv = sext i8 %tmp to i32
+  %add = add nsw i32 %conv, %s.013
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %inptr.014, i64 2
+  %tmp1 = load i8, i8* %incdec.ptr, align 1
+  %conv2 = sext i8 %tmp1 to i32
+  %add3 = add nsw i32 %add, %conv2
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  ret i32 %s.0.lcssa
+}