Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,11 @@
   "enable-lsr-phielim", cl::Hidden, cl::init(true),
   cl::desc("Enable LSR phi elimination"));
 
+// The flag adds instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+  "lsr-insns-cost", cl::Hidden, cl::init(false),
+  cl::desc("Add instruction count to a LSR cost model"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -323,6 +328,8 @@
 
   bool unscale();
 
+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;
 
@@ -457,6 +464,14 @@
   return true;
 }
 
+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -882,6 +897,7 @@
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
+  unsigned Insns;
   unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
@@ -892,8 +908,8 @@
 
 public:
   Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+      ImmCost(0), SetupCost(0), ScaleCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -902,9 +918,9 @@
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+    return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
-        || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+        || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
              & ImmCost & SetupCost & ScaleCost) == ~0u);
   }
 #endif
@@ -1151,6 +1167,9 @@
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned PrevAddRecCost = AddRecCost;
+  unsigned PrevNumRegs = NumRegs;
+  unsigned PrevNumBaseAdds = NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1170,6 +1189,18 @@
       return;
   }
 
+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction (at least a fill).
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (NumRegs > TTIRegNum) {
+    // If the cost already exceeded TTIRegNum, then only the newly added
+    // registers can add new instructions.
+    if (PrevNumRegs > TTIRegNum)
+      Insns += (NumRegs - PrevNumRegs);
+    else
+      Insns += (NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
@@ -1198,11 +1229,30 @@
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
+  // If the ICmpZero formula does not end in 0, it cannot be replaced by just
+  // an add or a sub. We'll need to compare the final result of the AddRec,
+  // which means we'll need an additional instruction.
+  // For -10 + {0, +, 1}:
+  //   i = i + 1;
+  //   cmp i, 10
+  //
+  // For {-10, +, 1}:
+  //   i = i + 1;
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    Insns++;
+  // Each new AddRec adds 1 instruction to the calculation.
+  Insns += (AddRecCost - PrevAddRecCost);
+
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    Insns += NumBaseAdds - PrevNumBaseAdds;
   assert(isValid() && "invalid cost");
 }
 
 /// Set this cost to a losing value.
 void Cost::Lose() {
+  Insns = ~0u;
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1214,6 +1264,8 @@
 
 /// Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
+  if (InsnsCost && Insns != Other.Insns)
+    return Insns < Other.Insns;
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1222,6 +1274,7 @@
 }
 
 void Cost::print(raw_ostream &OS) const {
+  OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
Index: test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -stats 2>&1 | grep "asm-printer" | grep 7
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that LSR optimizes the compare for a static counter.
+; That means that instead of creating the following:
+;   movl %ecx, (%rdx,%rax,4)
+;   incq %rax
+;   cmpq $1024, %rax
+; LSR should optimize out the cmp:
+;   movl %ecx, 4096(%rdx,%rax)
+;   addq $4, %rax
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -stats 2>&1 | grep "asm-printer" | grep 11
+
+; Check that LSR prefers fewer instructions over fewer registers.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}