This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
include/llvm/Analysis/
-
llvm/
-
Analysis/
-
IVUsers.h
-
lib/
-
Analysis/
-
IVUsers.cpp
-
Transforms/Scalar/
-
Scalar/
3/6
LoopStrengthReduce.cpp
-
test/Transforms/LoopStrengthReduce/
-
Transforms/
-
LoopStrengthReduce/
-
2013-01-14-ReuseCast.ll
-
AArch64/
3/4
skip-lsr-solution.ll

Differential D46193

[LSR] Skip LSR if the cost of input is cheaper than LSR's solution
Needs ReviewPublic

Authored by junbuml on Apr 27 2018, 8:39 AM.

Download Raw Diff

Details

Reviewers

qcolombet
kparzysz
sebpop
evstupac
atrick
javed.absar
rehana
mkazantsev
sanjoy

Summary

The heuristics used in LSR cannot always guarantee that LSR's final solution is the best.
Based on LSR's existing cost model, this change calculates the cost of LSR's input.
If LSR's final solution is more expensive than just using the input IR, then we don't
use LSR's solution.

Diff Detail

Event Timeline

junbuml created this revision.Apr 27 2018, 8:39 AM

Herald added a reviewer: javed.absar. · View Herald TranscriptApr 27 2018, 8:39 AM

Herald added a subscriber: mcrosier. · View Herald Transcript

I like this change, thanks for implementing it!

It is useful for those kernels where the programmer knows how
to get a better set of induction variables than the suboptimal
IVs selected by the compiler heuristic.

dmgreen added a subscriber: dmgreen.Apr 28 2018, 4:11 AM

I have wanted something like this! Thanks for doing it.

test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
2	This needs "REQUIRES: asserts".

Here's a testcase where LSR generates code that is worse than the original (on Hexagon):

#include <stdint.h>

typedef struct {
  int32_t X, Y;
  int32_t *Ptr;
} S;

int foo(S* Data, int32_t Off, int32_t Idx, int32_t *Out) {
  int32_t Col = Idx;
  if (Off >= Data->X)
    return 5;

  while (Col >= Data->Y)
    Col -= Data->Y;

  *Out = *(Data->Ptr + Col + Data->Y * Off);
  return 0;
}

Compile it with clang -target hexagon -O3. The code you added eventually punts (FormInputLSRUseAndFormula returns false), and LSR proceeds to do its thing. I did some analysis, and the problem is with %add.ptr = getelementptr inbounds i32, i32* %2, i32 %Col.0. This is not an "address use", since it goes into another GEP. It exists in the original source, but LSR never looks at it. Your code does and that makes it exit early. Maybe you should restrict the uses you look at to the same ones that LSR starts with?

Here's the IR immediately before LSR:

define dso_local i32 @foo(%struct.S* nocapture readonly %Data, i32 %Off, i32 %Idx, i32* nocapture %Out) local_unnamed_addr #0 {
entry:
  %X = getelementptr inbounds %struct.S, %struct.S* %Data, i32 0, i32 0
  %0 = load i32, i32* %X, align 4, !tbaa !2
  %cmp = icmp sgt i32 %0, %Off
  br i1 %cmp, label %while.cond.preheader, label %cleanup

while.cond.preheader:                             ; preds = %entry
  %Y = getelementptr inbounds %struct.S, %struct.S* %Data, i32 0, i32 1
  %1 = load i32, i32* %Y, align 4, !tbaa !8
  br label %while.cond

while.cond:                                       ; preds = %while.cond, %while.cond.preheader
  %Col.0 = phi i32 [ %sub, %while.cond ], [ %Idx, %while.cond.preheader ]
  %cmp1 = icmp slt i32 %Col.0, %1
  %sub = sub nsw i32 %Col.0, %1
  br i1 %cmp1, label %while.end, label %while.cond

while.end:                                        ; preds = %while.cond
  %Ptr = getelementptr inbounds %struct.S, %struct.S* %Data, i32 0, i32 2
  %2 = load i32*, i32** %Ptr, align 4, !tbaa !9
  %add.ptr = getelementptr inbounds i32, i32* %2, i32 %Col.0
  %mul = mul nsw i32 %1, %Off
  %add.ptr4 = getelementptr inbounds i32, i32* %add.ptr, i32 %mul
  %3 = load i32, i32* %add.ptr4, align 4, !tbaa !10
  store i32 %3, i32* %Out, align 4, !tbaa !10
  br label %cleanup

cleanup:                                          ; preds = %entry, %while.end
  %retval.0 = phi i32 [ 0, %while.end ], [ 5, %entry ]
  ret i32 %retval.0
}

sebpop added inline comments.May 9 2018, 11:48 AM

lib/Transforms/Scalar/LoopStrengthReduce.cpp
5727	Do you have some statistics on how many times this currently happens on a benchmark of your choice?
test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
2	I don't see any CHECK statement depending on -debug-only, so instead of requiring asserts, let's just remove that flag. Also please remove the other flag: -lsr-insns-cost=true as I see that its default value is true: "lsr-insns-cost", cl::Hidden, cl::init(true),

kparzysz added inline comments.May 10 2018, 7:22 AM

test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
2	It's there, the first CHECK line: `CHECK: Skip using LSR's solution`.

This patch is causing the following lit-test failures:

Builtins-i386-linux :: divsc3_test.c
LLVM :: CodeGen/X86/2006-05-11-InstrSched.ll                                                                          
LLVM :: CodeGen/X86/MergeConsecutiveStores.ll                                                                         
LLVM :: CodeGen/X86/atom-fixup-lea3.ll                                                                                
LLVM :: CodeGen/X86/conditional-tailcall.ll                                                                           
LLVM :: CodeGen/X86/loop-strength-reduce8.ll                                                                          
LLVM :: CodeGen/X86/lsr-interesting-step.ll                                                                           
LLVM :: CodeGen/X86/merge_store.ll                                                                                    
LLVM :: CodeGen/X86/misched-matrix.ll                                                                                 
LLVM :: CodeGen/X86/multiple-loop-post-inc.ll                                                                         
LLVM :: CodeGen/X86/ragreedy-hoist-spill.ll                                                                           
LLVM :: CodeGen/X86/regalloc-reconcile-broken-hints.ll
LLVM :: DebugInfo/COFF/fpo-shrink-wrap.ll
LLVM :: Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll                                                         
LLVM :: Transforms/LoopStrengthReduce/X86/ivchain-X86.ll                                                              
LLVM :: Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll                                                  
LLVM :: Transforms/LoopStrengthReduce/X86/nested-loop.ll                                                              
LLVM :: Transforms/LoopStrengthReduce/funclet.ll                                                                      
LLVM :: Transforms/LoopStrengthReduce/pr27056.ll

One of the reasons for these failures is that the current GetOrCreateLSRUse function does not take "S" by reference (explained in the inline comment). Though adding "&" reduces the number of failures, the following are still failing:

Builtins-i386-linux :: divsc3_test.c
LLVM :: CodeGen/X86/conditional-tailcall.ll
LLVM :: CodeGen/X86/regalloc-reconcile-broken-hints.ll
LLVM :: DebugInfo/COFF/fpo-shrink-wrap.ll             
LLVM :: Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
LLVM :: Transforms/LoopStrengthReduce/funclet.ll             
LLVM :: Transforms/LoopStrengthReduce/pr27056.ll

There might be some other problems in the cost modelling algorithm which need further investigation.

lib/Transforms/Scalar/LoopStrengthReduce.cpp
3357	The "S" parameter is missing the "&" and it must be added. This function calls getUse() with "S", and getUse modifies the parameter. Without the "&", the modification by getUse will not be seen by the caller of GetOrCreateLSRUse.

I tried this patch on exynos-m3 and there are several benchmarks improving by about 5%.
Among those benchmarks are spec2000 188.ammp and 256.bzip2 that improve by 3%.
All performance degradations are within noise level.

lib/Transforms/Scalar/LoopStrengthReduce.cpp
3603	s/on/one/
3620	s/getGEPExptr/getGEPExpr/

rehana added a reviewer: rehana.May 18 2018, 10:16 AM

Sorry for the extremely long delay on this change. I have just updated the patch. Please take a look and let me know if you have any comments.

Compile it with clang -target hexagon -O3. The code you added eventually punts (FormInputLSRUseAndFormula returns false), and LSR proceeds to do its thing. I did some analysis, and the problem is with %add.ptr = getelementptr inbounds i32, i32* %2, i32 %Col.0. This is not an "address use", since it goes into another GEP. It exists in the original source, but LSR never looks at it. Your code does and that makes it exit early. Maybe you should restrict the uses you look at to the same ones that LSR starts with?

We should consider the costs of all SCEVable instructions between the phi and the leaf IV users. When creating the initial formula, those SCEVable instructions in the middle must be folded in as part of the formula, but to find the input cost without the LSR transformation we should count the costs of all those instructions.

lib/Transforms/Scalar/LoopStrengthReduce.cpp
3357	Thanks for catching this!
5727	For now I want to be conservative about skipping LSR even when the input cost is shown to be cheap, so I applied some weight to the input cost before comparing it with the selected solution. Because of this weight, skipping does not seem to happen widely. In my test on spec2000, it impacts only 6 loops.
test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll
2	To force the cost model to use the instruction count for this test, we need to pass -lsr-insns-cost=true explicitly on the command line, because Cost::isLess() checks whether lsr-insns-cost actually occurred on the command line.

jedilyn added a subscriber: jedilyn.Sep 7 2018, 2:36 AM

mkazantsev resigned from this revision.Jul 28 2020, 7:38 AM

• ronglin added a subscriber: • ronglin.Aug 10 2020, 5:13 AM

sanjoy resigned from this revision.Jan 29 2022, 5:30 PM

Revision Contents

Path

Size

include/

llvm/

Analysis/

IVUsers.h

12 lines

lib/

Analysis/

IVUsers.cpp

4 lines

Transforms/

Scalar/

LoopStrengthReduce.cpp

365 lines

test/

Transforms/

LoopStrengthReduce/

2013-01-14-ReuseCast.ll

2 lines

AArch64/

skip-lsr-solution.ll

91 lines

Diff 164290

include/llvm/Analysis/IVUsers.h

Show All 9 Lines
// This file implements bookkeeping for "interesting" users of expressions		// This file implements bookkeeping for "interesting" users of expressions
// computed from induction variables.		// computed from induction variables.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_IVUSERS_H		#ifndef LLVM_ANALYSIS_IVUSERS_H
#define LLVM_ANALYSIS_IVUSERS_H		#define LLVM_ANALYSIS_IVUSERS_H

		#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/LoopAnalysisManager.h"		#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopPass.h"		#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"		#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/IR/ValueHandle.h"		#include "llvm/IR/ValueHandle.h"

namespace llvm {		namespace llvm {

class AssumptionCache;		class AssumptionCache;
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines
class IVUsers {		class IVUsers {
friend class IVStrideUse;		friend class IVStrideUse;
Loop *L;		Loop *L;
AssumptionCache *AC;		AssumptionCache *AC;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
ScalarEvolution *SE;		ScalarEvolution *SE;
SmallPtrSet<Instruction*, 16> Processed;		SmallPtrSet<Instruction*, 16> Processed;
		DenseMap<Value, SmallPtrSet<Value, 2>> IVOperandsMap;

/// IVUses - A list of all tracked IV uses of induction variable expressions		/// IVUses - A list of all tracked IV uses of induction variable expressions
/// we are interested in.		/// we are interested in.
ilist<IVStrideUse> IVUses;		ilist<IVStrideUse> IVUses;

// Ephemeral values used by @llvm.assume in this function.		// Ephemeral values used by @llvm.assume in this function.
SmallPtrSet<const Value *, 32> EphValues;		SmallPtrSet<const Value *, 32> EphValues;

public:		public:
IVUsers(Loop L, AssumptionCache AC, LoopInfo LI, DominatorTree DT,		IVUsers(Loop L, AssumptionCache AC, LoopInfo LI, DominatorTree DT,
ScalarEvolution *SE);		ScalarEvolution *SE);

IVUsers(IVUsers &&X)		IVUsers(IVUsers &&X)
: L(std::move(X.L)), AC(std::move(X.AC)), DT(std::move(X.DT)),		: L(std::move(X.L)), AC(std::move(X.AC)), DT(std::move(X.DT)),
SE(std::move(X.SE)), Processed(std::move(X.Processed)),		SE(std::move(X.SE)), Processed(std::move(X.Processed)),
IVUses(std::move(X.IVUses)), EphValues(std::move(X.EphValues)) {		IVOperandsMap(std::move(X.IVOperandsMap)), IVUses(std::move(X.IVUses)),
		EphValues(std::move(X.EphValues)) {
for (IVStrideUse &U : IVUses)		for (IVStrideUse &U : IVUses)
U.Parent = this;		U.Parent = this;
}		}
IVUsers(const IVUsers &) = delete;		IVUsers(const IVUsers &) = delete;
IVUsers &operator=(IVUsers &&) = delete;		IVUsers &operator=(IVUsers &&) = delete;
IVUsers &operator=(const IVUsers &) = delete;		IVUsers &operator=(const IVUsers &) = delete;

Loop *getLoop() const { return L; }		Loop *getLoop() const { return L; }
Show All 21 Lines	public:
const_iterator begin() const { return IVUses.begin(); }		const_iterator begin() const { return IVUses.begin(); }
const_iterator end() const { return IVUses.end(); }		const_iterator end() const { return IVUses.end(); }
bool empty() const { return IVUses.empty(); }		bool empty() const { return IVUses.empty(); }

bool isIVUserOrOperand(Instruction *Inst) const {		bool isIVUserOrOperand(Instruction *Inst) const {
return Processed.count(Inst);		return Processed.count(Inst);
}		}

		/// Return IV operands used by \p Use.
		SmallPtrSetImpl<Value > getIVsUsedBy(Value *Use) {
		if (!IVOperandsMap.count(Use))
		return nullptr;
		return &IVOperandsMap[Use];
		}

void releaseMemory();		void releaseMemory();

void print(raw_ostream &OS, const Module * = nullptr) const;		void print(raw_ostream &OS, const Module * = nullptr) const;

/// dump - This method is used for debugging.		/// dump - This method is used for debugging.
void dump() const;		void dump() const;

protected:		protected:
Show All 40 Lines

lib/Analysis/IVUsers.cpp

Show First 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	for (Use &U : I->uses()) {
if (PHINode *PHI = dyn_cast<PHINode>(User)) {		if (PHINode *PHI = dyn_cast<PHINode>(User)) {
unsigned OperandNo = U.getOperandNo();		unsigned OperandNo = U.getOperandNo();
unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);		unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
UseBB = PHI->getIncomingBlock(ValNo);		UseBB = PHI->getIncomingBlock(ValNo);
}		}
if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests))		if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests))
return false;		return false;

		// Track IV operands.
		IVOperandsMap[User].insert(I);

// Descend recursively, but not into PHI nodes outside the current loop.		// Descend recursively, but not into PHI nodes outside the current loop.
// It's important to see the entire expression outside the loop to get		// It's important to see the entire expression outside the loop to get
// choices that depend on addressing mode use right, although we won't		// choices that depend on addressing mode use right, although we won't
// consider references outside the loop in all cases.		// consider references outside the loop in all cases.
// If User is already in Processed, we don't want to recurse into it again,		// If User is already in Processed, we don't want to recurse into it again,
// but do want to record a second reference in the same instruction.		// but do want to record a second reference in the same instruction.
bool AddUserToIVUsers = false;		bool AddUserToIVUsers = false;
if (LI->getLoopFor(User->getParent()) != L) {		if (LI->getLoopFor(User->getParent()) != L) {
▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines

#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)		#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void IVUsers::dump() const { print(dbgs()); }		LLVM_DUMP_METHOD void IVUsers::dump() const { print(dbgs()); }
#endif		#endif

void IVUsers::releaseMemory() {		void IVUsers::releaseMemory() {
Processed.clear();		Processed.clear();
IVUses.clear();		IVUses.clear();
		IVOperandsMap.clear();
}		}

IVUsersWrapperPass::IVUsersWrapperPass() : LoopPass(ID) {		IVUsersWrapperPass::IVUsersWrapperPass() : LoopPass(ID) {
initializeIVUsersWrapperPassPass(*PassRegistry::getPassRegistry());		initializeIVUsersWrapperPassPass(*PassRegistry::getPassRegistry());
}		}

void IVUsersWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {		void IVUsersWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();		AU.addRequired<AssumptionCacheTracker>();
▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

lib/Transforms/Scalar/LoopStrengthReduce.cpp

Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines
#include "llvm/ADT/Hashing.h"		#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/PointerIntPair.h"		#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"		#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"		#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"		#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
		#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"		#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/IVUsers.h"		#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopAnalysisManager.h"		#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"		#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"		#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"		#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"		#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"		#include "llvm/Analysis/ScalarEvolutionExpressions.h"
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
#include <limits>		#include <limits>
#include <map>		#include <map>
#include <utility>		#include <utility>

using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "loop-reduce"		#define DEBUG_TYPE "loop-reduce"

		STATISTIC(NumCheaperInput, "Number of cheaper LSR input");

/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for		/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
/// bail out. This threshold is far beyond the number of users that LSR can		/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the		/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.		/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;		static const unsigned MaxIVUsers = 200;

		static cl::opt<bool>
		EnableInitalCostCheck("enable-lsr-input-cost-check", cl::Hidden,
		cl::init(true),
		cl::desc("Enable LSR input cost check"));

// Temporary flag to cleanup congruent phis after LSR phi expansion.		// Temporary flag to cleanup congruent phis after LSR phi expansion.
// It's currently disabled until we can determine whether it's truly useful or		// It's currently disabled until we can determine whether it's truly useful or
// not. The flag should be removed after the v3.0 release.		// not. The flag should be removed after the v3.0 release.
// This is now needed for ivchains.		// This is now needed for ivchains.
static cl::opt<bool> EnablePhiElim(		static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),		"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));		cl::desc("Enable LSR phi elimination"));

▲ Show 20 Lines • Show All 875 Lines • ▼ Show 20 Lines	Cost() {
C.SetupCost = 0;		C.SetupCost = 0;
C.ScaleCost = 0;		C.ScaleCost = 0;
}		}

bool isLess(Cost &Other, const TargetTransformInfo &TTI);		bool isLess(Cost &Other, const TargetTransformInfo &TTI);

void Lose();		void Lose();

		void ApplyWeight();

#ifndef NDEBUG		#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.		// Once any of the metrics loses, they must all remain losers.
bool isValid() {		bool isValid() {
return ((C.Insns \| C.NumRegs \| C.AddRecCost \| C.NumIVMuls \| C.NumBaseAdds		return ((C.Insns \| C.NumRegs \| C.AddRecCost \| C.NumIVMuls \| C.NumBaseAdds
\| C.ImmCost \| C.SetupCost \| C.ScaleCost) != ~0u)		\| C.ImmCost \| C.SetupCost \| C.ScaleCost) != ~0u)
\|\| ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds		\|\| ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
& C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);		& C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
}		}
▲ Show 20 Lines • Show All 368 Lines • ▼ Show 20 Lines	void Cost::Lose() {
C.AddRecCost = std::numeric_limits<unsigned>::max();		C.AddRecCost = std::numeric_limits<unsigned>::max();
C.NumIVMuls = std::numeric_limits<unsigned>::max();		C.NumIVMuls = std::numeric_limits<unsigned>::max();
C.NumBaseAdds = std::numeric_limits<unsigned>::max();		C.NumBaseAdds = std::numeric_limits<unsigned>::max();
C.ImmCost = std::numeric_limits<unsigned>::max();		C.ImmCost = std::numeric_limits<unsigned>::max();
C.SetupCost = std::numeric_limits<unsigned>::max();		C.SetupCost = std::numeric_limits<unsigned>::max();
C.ScaleCost = std::numeric_limits<unsigned>::max();		C.ScaleCost = std::numeric_limits<unsigned>::max();
}		}

		/// Apply some weight on this cost.
		void Cost::ApplyWeight() {
		// FIXME: For now, simply increase Insns and NumRegs by one and ignore other
		// cost criteria.
		++C.Insns;
		++C.NumRegs;
		C.AddRecCost = std::numeric_limits<unsigned>::max();
		C.NumIVMuls = std::numeric_limits<unsigned>::max();
		C.NumBaseAdds = std::numeric_limits<unsigned>::max();
		C.ImmCost = std::numeric_limits<unsigned>::max();
		C.SetupCost = std::numeric_limits<unsigned>::max();
		C.ScaleCost = std::numeric_limits<unsigned>::max();
		}

/// Choose the lower cost.		/// Choose the lower cost.
bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {		bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&		if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
C.Insns != Other.C.Insns)		C.Insns != Other.C.Insns)
return C.Insns < Other.C.Insns;		return C.Insns < Other.C.Insns;
return TTI.isLSRCostLess(C, Other.C);		return TTI.isLSRCostLess(C, Other.C);
}		}

▲ Show 20 Lines • Show All 526 Lines • ▼ Show 20 Lines	class LSRInstance {
UseMapTy UseMap;		UseMapTy UseMap;

bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,		bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
LSRUse::KindType Kind, MemAccessTy AccessTy);		LSRUse::KindType Kind, MemAccessTy AccessTy);

std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,		std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
MemAccessTy AccessTy);		MemAccessTy AccessTy);

		std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
		MemAccessTy AccessTy,
		SmallVectorImpl<LSRUse> &LsrUses,
		UseMapTy &UseMap);

		LSRUse &GetOrCreateLSRUse(Instruction UserInst, Value IVOp, const SCEV *&S,
		LSRUse::KindType Kind, MemAccessTy &AccessTy,
		PostIncLoopSet &TmpPostIncLoops, UseMapTy &UseMap,
		SmallVectorImpl<LSRUse> &LsrUses, size_t &LUIdx);

		bool FormInputGEPFormula(Value *Op, LSRUse &LU, Formula &F);

		bool
		FormInputLSRUsesAndFormulae(Instruction UserInst, Value OperandValToReplace,
		DenseSet<std::pair<Value , Value >> &Visited,
		SmallVectorImpl<LSRUse> &InputUses,
		UseMapTy &InputUseMap);

		bool FormInputLSRUseAndFormula(Instruction UserInst, Value Op,
		DenseSet<std::pair<Value , Value >> &Visited,
		SmallVectorImpl<LSRUse> &InputUses,
		UseMapTy &InputUseMap);

		bool CollectInputFormulae(SmallVectorImpl<LSRUse> &LSRUses);

		bool IsInputCostCheaperThanSolutionCost(Cost &SolutionCost);

		bool IsInputCostStillCheap(Cost &SolutionCost, Cost &InputCost);

void DeleteUse(LSRUse &LU, size_t LUIdx);		void DeleteUse(LSRUse &LU, size_t LUIdx);

LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);		LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);		void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);		void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);		void CountRegisters(const Formula &F, size_t LUIdx);
bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);		bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F,
		bool CountReg = true);

void CollectLoopInvariantFixupsAndFormulae();		void CollectLoopInvariantFixupsAndFormulae();

void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,		void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
unsigned Depth = 0);		unsigned Depth = 0);

void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,		void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, unsigned Depth,		const Formula &Base, unsigned Depth,
Show All 26 Lines	class LSRInstance {
void NarrowSearchSpaceUsingHeuristics();		void NarrowSearchSpaceUsingHeuristics();

void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,		void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
Cost &SolutionCost,		Cost &SolutionCost,
SmallVectorImpl<const Formula *> &Workspace,		SmallVectorImpl<const Formula *> &Workspace,
const Cost &CurCost,		const Cost &CurCost,
const SmallPtrSet<const SCEV *, 16> &CurRegs,		const SmallPtrSet<const SCEV *, 16> &CurRegs,
DenseSet<const SCEV *> &VisitedRegs) const;		DenseSet<const SCEV *> &VisitedRegs) const;
void Solve(SmallVectorImpl<const Formula *> &Solution) const;
		void Solve(SmallVectorImpl<const Formula *> &Solution,
		Cost &SolutionCost) const;

BasicBlock::iterator		BasicBlock::iterator
HoistInsertPosition(BasicBlock::iterator IP,		HoistInsertPosition(BasicBlock::iterator IP,
const SmallVectorImpl<Instruction *> &Inputs) const;		const SmallVectorImpl<Instruction *> &Inputs) const;
BasicBlock::iterator		BasicBlock::iterator
AdjustInsertPositionForExpand(BasicBlock::iterator IP,		AdjustInsertPositionForExpand(BasicBlock::iterator IP,
const LSRFixup &LF,		const LSRFixup &LF,
const LSRUse &LU,		const LSRUse &LU,
▲ Show 20 Lines • Show All 519 Lines • ▼ Show 20 Lines
}		}

/// Return an LSRUse index and an offset value for a fixup which needs the given		/// Return an LSRUse index and an offset value for a fixup which needs the given
/// expression, with the given kind and optional access type. Either reuse an		/// expression, with the given kind and optional access type. Either reuse an
/// existing use or create a new one, as needed.		/// existing use or create a new one, as needed.
std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,		std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
LSRUse::KindType Kind,		LSRUse::KindType Kind,
MemAccessTy AccessTy) {		MemAccessTy AccessTy) {
		return getUse(Expr, Kind, AccessTy, Uses, UseMap);
		}

		std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
		LSRUse::KindType Kind,
		MemAccessTy AccessTy,
		SmallVectorImpl<LSRUse> &LUs,
		UseMapTy &UMap) {
const SCEV *Copy = Expr;		const SCEV *Copy = Expr;
int64_t Offset = ExtractImmediate(Expr, SE);		int64_t Offset = ExtractImmediate(Expr, SE);

// Basic uses can't accept any offset, for example.		// Basic uses can't accept any offset, for example.
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /BaseGV=/ nullptr,		if (!isAlwaysFoldable(TTI, Kind, AccessTy, /BaseGV=/ nullptr,
Offset, /HasBaseReg=/ true)) {		Offset, /HasBaseReg=/ true)) {
Expr = Copy;		Expr = Copy;
Offset = 0;		Offset = 0;
}		}

std::pair<UseMapTy::iterator, bool> P =		std::pair<UseMapTy::iterator, bool> P =
UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));		UMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));

if (!P.second) {		if (!P.second) {
// A use already existed with this base.		// A use already existed with this base.
size_t LUIdx = P.first->second;		size_t LUIdx = P.first->second;
LSRUse &LU = Uses[LUIdx];		LSRUse &LU = LUs[LUIdx];
if (reconcileNewOffset(LU, Offset, /HasBaseReg=/true, Kind, AccessTy))		if (reconcileNewOffset(LU, Offset, /HasBaseReg=/true, Kind, AccessTy))
// Reuse this use.		// Reuse this use.
return std::make_pair(LUIdx, Offset);		return std::make_pair(LUIdx, Offset);
}		}

// Create a new use.		// Create a new use.
size_t LUIdx = Uses.size();		size_t LUIdx = LUs.size();
P.first->second = LUIdx;		P.first->second = LUIdx;
Uses.push_back(LSRUse(Kind, AccessTy));		LUs.push_back(LSRUse(Kind, AccessTy));
LSRUse &LU = Uses[LUIdx];		LSRUse &LU = LUs[LUIdx];

LU.MinOffset = Offset;		LU.MinOffset = Offset;
LU.MaxOffset = Offset;		LU.MaxOffset = Offset;
return std::make_pair(LUIdx, Offset);		return std::make_pair(LUIdx, Offset);
}		}

/// Delete the given use from the Uses list.		/// Delete the given use from the Uses list.
void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {		void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
▲ Show 20 Lines • Show All 630 Lines • ▼ Show 20 Lines	for (PHINode &Phi : L->getHeader()->phis()) {
IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");		IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
}		}
Phi.replaceUsesOfWith(PostIncV, IVOper);		Phi.replaceUsesOfWith(PostIncV, IVOper);
DeadInsts.emplace_back(PostIncV);		DeadInsts.emplace_back(PostIncV);
}		}
}		}
}		}

		bool LSRInstance::FormInputLSRUsesAndFormulae(
		Instruction UserInst, Value OperandValToReplace,
		DenseSet<std::pair<Value , Value >> &Visited,
		SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap) {
		SmallVector<std::pair<Instruction , Value >, 4> Worklist;
		Worklist.push_back(std::make_pair(UserInst, OperandValToReplace));

		while (!Worklist.empty()) {
		std::pair<Instruction , Value > Element = Worklist.pop_back_val();
		Value *Op = Element.second;
		Instruction *Inst = Element.first;

		if (!FormInputLSRUseAndFormula(Inst, Op, Visited, InputUses, InputUseMap))
		return false;

		if (!IU.getIVsUsedBy(Op)) {
		assert(isa<PHINode>(Op) && "Expect only a PHI is the root of IV operand");
		continue;
		}
		for (auto IV : *IU.getIVsUsedBy(Op))
		Worklist.push_back(std::make_pair(cast<Instruction>(Op), IV));
		}
		return true;
		}

		bool LSRInstance::FormInputLSRUseAndFormula(
		Instruction UserInst, Value Op,
		DenseSet<std::pair<Value , Value >> &Visited,
		SmallVectorImpl<LSRUse> &InputUses, UseMapTy &InputUseMap) {
		if (!Visited.insert(std::make_pair(UserInst, Op)).second)
		return true;

		LSRUse::KindType Kind = LSRUse::Basic;
		MemAccessTy AccessTy;
		if (UserInst && isAddressUse(TTI, UserInst, Op)) {
		Kind = LSRUse::Address;
		AccessTy = getAccessType(TTI, UserInst, Op);
		}

		const SCEV *S = SE.getSCEV(Op);
		size_t LUIdx;
		PostIncLoopSet TmpPostIncLoops;
		LSRUse &LU =
		GetOrCreateLSRUse(UserInst, Op, S, Kind, AccessTy, TmpPostIncLoops,
		InputUseMap, InputUses, LUIdx);

		Formula F;

		if (isa<GetElementPtrInst>(Op)) {
		// TODO: For now we only consider GEPs directly used in addressing.
		if (!isAddressUse(TTI, UserInst, Op))
		return false;
		// Since we need to model the situation where IV operands are all kept we
		// should break down GEP's SCEV in terms of its IV operands so that we can
		// reuse Regs from IV operands in GEP. For example, in IR below,
		//
		// %iv = i32 phi [0, %Preheader],[%iv.next, %LoopBody]
		// getelementptr i32* %Base, i32 %iv
		// %iv.next = add i32 %iv, #1
		//
		// we cannot simply use the original initial formula (reg(%Base,+,4)) from
		// SE.getSCEV(). Instead, we have to break down the formula in terms of the
		// IV operand like reg(%Base) + 4*reg(0,+,1) so that we can let the cost
		// model capture reg(0,+,1) is shared.
		if (!FormInputGEPFormula(Op, LU, F))
		return false;
		} else
		F.initialMatch(S, L, SE);

		if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
		return false;

		InsertFormula(LU, LUIdx, F, /* CountReg= */ false);

		// TODO: For now, expect only one formula for a LU. If we encounter multiple
		// formulae, it must be due to other GEPs in the same LU created in different
		// form in FormInputGEPFormula(). We do not handle such case for now.
		if (LU.Formulae.size() != 1)
		return false;

		return true;
		}

		LSRUse &LSRInstance::GetOrCreateLSRUse(
		Instruction *UserInst, Value *IVOp, const SCEV *&S, LSRUse::KindType Kind,
		MemAccessTy &AccessTy, PostIncLoopSet &TmpPostIncLoops, UseMapTy &LsrUseMap,
		rehanaUnsubmitted Done Reply Inline Actions The "S" parameter is missing the "&" and it must be added. This function calls getUse() with "S", and getUse modifies the parameter. Without the "&", the modification by getUse will not be seen by the caller of GetOrCreateLSRUse. rehana: The "S" parameter is missing the "&" and it must be added. This function calls getUse() with…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Thanks for catching ! junbuml: Thanks for catching !
		SmallVectorImpl<LSRUse> &LsrUses, size_t &LUIdx) {
		// Get or create an LSRUse.
		std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy, LsrUses, LsrUseMap);
		LUIdx = P.first;
		int64_t Offset = P.second;
		LSRUse &LU = LsrUses[LUIdx];

		// Record the fixup.
		LSRFixup &LF = LU.getNewFixup();
		LF.UserInst = UserInst;
		LF.OperandValToReplace = IVOp;
		LF.PostIncLoops = TmpPostIncLoops;
		LF.Offset = Offset;
		LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);

		if (!LU.WidestFixupType ||
		SE.getTypeSizeInBits(LU.WidestFixupType) <
		SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
		LU.WidestFixupType = LF.OperandValToReplace->getType();

		return LU;
		}

		/// Build, for every IV stride use in IU, the LSRUses and formulae that
		/// describe the input IR as it currently stands.
		///
		/// \param InputUses Receives the collected LSRUses.
		/// \returns false as soon as FormInputLSRUsesAndFormulae() fails for one of
		///          the IV users; true when every user was processed.
		bool LSRInstance::CollectInputFormulae(SmallVectorImpl<LSRUse> &InputUses) {
		  DenseSet<std::pair<Value *, Value *>> Visited;
		  UseMapTy InputUseMap;

		  for (const IVStrideUse &U : IU) {
		    if (!FormInputLSRUsesAndFormulae(U.getUser(), U.getOperandValToReplace(),
		                                     Visited, InputUses, InputUseMap))
		      return false;
		  }
		  return true;
		}

void LSRInstance::CollectFixupsAndInitialFormulae() {		void LSRInstance::CollectFixupsAndInitialFormulae() {
for (const IVStrideUse &U : IU) {		for (const IVStrideUse &U : IU) {
Instruction *UserInst = U.getUser();		Instruction *UserInst = U.getUser();
// Skip IV users that are part of profitable IV Chains.		// Skip IV users that are part of profitable IV Chains.
User::op_iterator UseI =		User::op_iterator UseI =
find(UserInst->operands(), U.getOperandValToReplace());		find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");		assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI)) {		if (IVIncSet.count(UseI)) {
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
// -1 and the negations of all interesting strides (except the negation		// -1 and the negations of all interesting strides (except the negation
// of -1) are now also interesting.		// of -1) are now also interesting.
for (size_t i = 0, e = Factors.size(); i != e; ++i)		for (size_t i = 0, e = Factors.size(); i != e; ++i)
if (Factors[i] != -1)		if (Factors[i] != -1)
Factors.insert(-(uint64_t)Factors[i]);		Factors.insert(-(uint64_t)Factors[i]);
Factors.insert(-1);		Factors.insert(-1);
}		}

// Get or create an LSRUse.		size_t LUIdx;
std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);		LSRUse &LU =
size_t LUIdx = P.first;		GetOrCreateLSRUse(UserInst, U.getOperandValToReplace(), S, Kind,
int64_t Offset = P.second;		AccessTy, TmpPostIncLoops, UseMap, Uses, LUIdx);
LSRUse &LU = Uses[LUIdx];

// Record the fixup.
LSRFixup &LF = LU.getNewFixup();
LF.UserInst = UserInst;
LF.OperandValToReplace = U.getOperandValToReplace();
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);

if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
LU.WidestFixupType = LF.OperandValToReplace->getType();

// If this is the first use of this LSRUse, give it a formula.		// If this is the first use of this LSRUse, give it a formula.
if (LU.Formulae.empty()) {		if (LU.Formulae.empty()) {
InsertInitialFormula(S, LU, LUIdx);		InsertInitialFormula(S, LU, LUIdx);
CountRegisters(LU.Formulae.back(), LUIdx);		CountRegisters(LU.Formulae.back(), LUIdx);
}		}
}		}

Show All 31 Lines	void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
if (F.ScaledReg)		if (F.ScaledReg)
RegUses.countRegister(F.ScaledReg, LUIdx);		RegUses.countRegister(F.ScaledReg, LUIdx);
for (const SCEV *BaseReg : F.BaseRegs)		for (const SCEV *BaseReg : F.BaseRegs)
RegUses.countRegister(BaseReg, LUIdx);		RegUses.countRegister(BaseReg, LUIdx);
}		}

/// If the given formula has not yet been inserted, add it to the list, and		/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise.		/// return true. Return false otherwise.
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {		bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F,
		bool CountReg) {
// Do not insert formula that we will not be able to expand.		// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&		assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");		"Formula is illegal");

if (!LU.InsertFormula(F, *L))		if (!LU.InsertFormula(F, *L))
return false;		return false;

		if (CountReg)
CountRegisters(F, LUIdx);		CountRegisters(F, LUIdx);
return true;		return true;
}		}

		/// Build a formula for the GEP \p Op that keeps its IV operands as separate
		/// registers instead of folding them into one SCEV.
		///
		/// The pointer operand and each index are classified as IV operand (present
		/// in IU.getIVsUsedBy(Op)) or non-IV operand. All non-IV parts are summed and
		/// matched into a temporary formula; each IV operand is then attached either
		/// as the scaled register (when base registers exist and the scaled register
		/// is still free) or as an extra base register holding IV * element size.
		///
		/// \param Op The GEP to model; must have at least one IV operand (asserted).
		/// \param LU The LSRUse the formula is meant for; not read by this function.
		/// \param F  Receives the formula on success; left untouched on failure.
		/// \returns false when the resulting formula is not canonical for the loop,
		///          true otherwise.
		bool LSRInstance::FormInputGEPFormula(Value *Op, LSRUse &LU, Formula &F) {
		  assert(IU.getIVsUsedBy(Op) && "Expect at least one IV operand in GEP");
		  GEPOperator *GEP = cast<GEPOperator>(Op);
		  SmallVector<const SCEV *, 4> NonIVOpExprs;
		  // Each entry is (scale, IV operand expression).
		  SmallVector<std::pair<const SCEV *, const SCEV *>, 4> IVOpExprs;
		  const SCEV *BaseExpr = SE.getSCEV(GEP->getPointerOperand());
		  Type *IntPtrTy = SE.getEffectiveSCEVType(BaseExpr->getType());
		  SCEV::NoWrapFlags Wrap =
		      GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
		  Type *CurTy = ArrayType::get(GEP->getSourceElementType(), 0);

		  // The pointer operand itself may be an IV; it then carries a scale of one.
		  if (!IU.getIVsUsedBy(Op)->count(GEP->getPointerOperand()))
		    NonIVOpExprs.push_back(BaseExpr);
		  else
		    IVOpExprs.push_back(std::make_pair(SE.getOne(IntPtrTy), BaseExpr));

		  // Note that this loop is inspired by SE.getGEPExpr().
		  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index) {
		    Value *GEPIndex = *Index;
		    const SCEV *IndexExpr = SE.getSCEV(GEPIndex);

		    // Compute the (potentially symbolic) offset in bytes for this index.
		    if (StructType *STy = dyn_cast<StructType>(CurTy)) {
		      // For a struct, add the member offset.
		      ConstantInt *FieldIndex = cast<SCEVConstant>(IndexExpr)->getValue();
		      unsigned FieldNo = FieldIndex->getZExtValue();
		      const SCEV *FieldOffset = SE.getOffsetOfExpr(IntPtrTy, STy, FieldNo);

		      // The field offset will be added as a part of NonIVOps.
		      NonIVOpExprs.push_back(FieldOffset);

		      // Update CurTy to the type of the field at this index.
		      CurTy = STy->getTypeAtIndex(FieldIndex);
		    } else {
		      // Update CurTy to its element type.
		      CurTy = cast<SequentialType>(CurTy)->getElementType();
		      // For an array, find the element offset, explicitly scaled.
		      const SCEV *ElementSize = SE.getSizeOfExpr(IntPtrTy, CurTy);
		      // Getelementptr indices are signed.
		      IndexExpr = SE.getTruncateOrSignExtend(IndexExpr, IntPtrTy);

		      // Multiply the index by the element size to compute the element offset.
		      const SCEV *LocalOffset = SE.getMulExpr(IndexExpr, ElementSize, Wrap);

		      if (!IU.getIVsUsedBy(Op)->count(GEPIndex))
		        NonIVOpExprs.push_back(LocalOffset);
		      else
		        IVOpExprs.push_back(std::make_pair(ElementSize, IndexExpr));
		    }
		  }

		  const SCEV *NonIVOps;
		  if (NonIVOpExprs.empty())
		    NonIVOps = SE.getZero(IntPtrTy);
		  else
		    NonIVOps = SE.getAddExpr(NonIVOpExprs);

		  Formula TempF;
		  TempF.initialMatch(NonIVOps, L, SE);

		  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2 because we
		  // want to place IV in ScaleReg.
		  TempF.unscale();

		  for (auto IVOpExpr : IVOpExprs) {
		    const SCEV *ElementSize = IVOpExpr.first;
		    const SCEV *IVOp = IVOpExpr.second;
		    int64_t Scale =
		        (cast<SCEVConstant>(ElementSize))->getAPInt().getSExtValue();

		    if (!TempF.BaseRegs.empty() && !TempF.ScaledReg) {
		      // First IV operand next to a base register: use the scaled slot.
		      TempF.Scale = Scale;
		      TempF.ScaledReg = IVOp;
		    } else {
		      // Otherwise keep the scaled IV as a base register of its own.
		      const SCEV *Offset = SE.getMulExpr(IVOp, ElementSize, Wrap);
		      TempF.BaseRegs.push_back(Offset);
		      TempF.HasBaseReg = true;
		    }
		  }

		  if (!TempF.isCanonical(*L))
		    return false;

		  F = TempF;
		  return true;
		}

/// Check for other uses of loop-invariant values which we're tracking. These		/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable		/// other uses will pin these values in registers, making them less profitable
		sebpopUnsubmitted Not Done Reply Inline Actions s/on/one/ sebpop: s/on/one/
/// for elimination.		/// for elimination.
/// TODO: This currently misses non-constant addrec step registers.		/// TODO: This currently misses non-constant addrec step registers.
/// TODO: Should this give more weight to users inside the loop?		/// TODO: Should this give more weight to users inside the loop?
void		void
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {		LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());		SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
SmallPtrSet<const SCEV *, 32> Visited;		SmallPtrSet<const SCEV *, 32> Visited;

while (!Worklist.empty()) {		while (!Worklist.empty()) {
const SCEV *S = Worklist.pop_back_val();		const SCEV *S = Worklist.pop_back_val();

// Don't process the same SCEV twice		// Don't process the same SCEV twice
if (!Visited.insert(S).second)		if (!Visited.insert(S).second)
continue;		continue;

if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))		if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
Worklist.append(N->op_begin(), N->op_end());		Worklist.append(N->op_begin(), N->op_end());
		sebpopUnsubmitted Done Reply Inline Actions s/getGEPExptr/getGEPExpr/ sebpop: s/getGEPExptr/getGEPExpr/
else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))		else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
Worklist.push_back(C->getOperand());		Worklist.push_back(C->getOperand());
else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {		else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
Worklist.push_back(D->getLHS());		Worklist.push_back(D->getLHS());
Worklist.push_back(D->getRHS());		Worklist.push_back(D->getRHS());
} else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {		} else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
const Value *V = US->getValue();		const Value *V = US->getValue();
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {		if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
▲ Show 20 Lines • Show All 1,467 Lines • ▼ Show 20 Lines	if (NewCost.isLess(SolutionCost, TTI)) {
}		}
Workspace.pop_back();		Workspace.pop_back();
}		}
}		}
}		}

/// Choose one formula from each use. Return the results in the given Solution		/// Choose one formula from each use. Return the results in the given Solution
/// vector.		/// vector.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {		void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution,
		Cost &SolutionCost) const {
SmallVector<const Formula *, 8> Workspace;		SmallVector<const Formula *, 8> Workspace;
Cost SolutionCost;
SolutionCost.Lose();		SolutionCost.Lose();
Cost CurCost;		Cost CurCost;
SmallPtrSet<const SCEV *, 16> CurRegs;		SmallPtrSet<const SCEV *, 16> CurRegs;
DenseSet<const SCEV *> VisitedRegs;		DenseSet<const SCEV *> VisitedRegs;
Workspace.reserve(Uses.size());		Workspace.reserve(Uses.size());

// SolveRecurse does all the work.		// SolveRecurse does all the work.
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,		SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
Show All 14 Lines	LLVM_DEBUG(dbgs() << "\n"
" ";		" ";
Solution[i]->print(dbgs());		Solution[i]->print(dbgs());
dbgs() << '\n';		dbgs() << '\n';
});		});

assert(Solution.size() == Uses.size() && "Malformed solution!");		assert(Solution.size() == Uses.size() && "Malformed solution!");
}		}

		/// Return true when \p InputCost is less than \p SolutionCost both as given
		/// and after ApplyWeight() has been applied to a copy of it. The weighted
		/// comparison makes skipping LSR's selected solution a conservative choice.
		bool LSRInstance::IsInputCostStillCheap(Cost &SolutionCost, Cost &InputCost) {
		  if (InputCost.isLess(SolutionCost, TTI)) {
		    // Weigh a copy so the caller's running input cost is left unchanged.
		    Cost Weighted = InputCost;
		    Weighted.ApplyWeight();
		    return Weighted.isLess(SolutionCost, TTI);
		  }
		  return false;
		}

		/// Rate the input IR with the LSR cost model and compare it against the cost
		/// of the solution LSR selected.
		///
		/// The input formulae are collected with CollectInputFormulae() and rated one
		/// LSRUse at a time; the accumulated cost is checked after every use so the
		/// function can stop as soon as the input is no longer cheap.
		///
		/// \param SolutionCost Cost of the solution chosen by LSR.
		/// \returns true only when input formulae could be collected, at least one
		///          input use exists, and the input cost stays cheap (see
		///          IsInputCostStillCheap()) after every use has been rated.
		bool LSRInstance::IsInputCostCheaperThanSolutionCost(Cost &SolutionCost) {
		  SmallVector<LSRUse, 16> InputUses;
		  if (!CollectInputFormulae(InputUses) || InputUses.empty())
		    return false;

		  DenseSet<const SCEV *> VisitedRegs;
		  SmallPtrSet<const SCEV *, 16> Regs;
		  Cost InputCost;
		  for (size_t i = 0, e = InputUses.size(); i != e; ++i) {
		    const LSRUse &LU = InputUses[i];
		    assert(LU.Formulae.size() == 1 && "Expect only one formula");
		    InputCost.RateFormula(TTI, LU.Formulae[0], Regs, VisitedRegs, L, SE, DT,
		                          LU);

		    // Bail out early once the partial input cost is no longer cheap.
		    if (!IsInputCostStillCheap(SolutionCost, InputCost))
		      return false;
		  }

		  LLVM_DEBUG(dbgs() << "\n"
		                       "The input requires ";
		             InputCost.print(dbgs()); dbgs() << ":\n"
		                                                "The chosen solution requires ";
		             SolutionCost.print(dbgs()); dbgs() << ":\n";);
		  return true;
		}

/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as		/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
/// we can go while still being dominated by the input positions. This helps		/// we can go while still being dominated by the input positions. This helps
/// canonicalize the insert position, which encourages sharing.		/// canonicalize the insert position, which encourages sharing.
BasicBlock::iterator		BasicBlock::iterator
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,		LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
const SmallVectorImpl<Instruction *> &Inputs)		const SmallVectorImpl<Instruction *> &Inputs)
const {		const {
Instruction *Tentative = &*IP;		Instruction *Tentative = &*IP;
▲ Show 20 Lines • Show All 524 Lines • ▼ Show 20 Lines	#endif // DEBUG
// Now use the reuse data to generate a bunch of interesting ways		// Now use the reuse data to generate a bunch of interesting ways
// to formulate the values needed for the uses.		// to formulate the values needed for the uses.
GenerateAllReuseFormulae();		GenerateAllReuseFormulae();

FilterOutUndesirableDedicatedRegisters();		FilterOutUndesirableDedicatedRegisters();
NarrowSearchSpaceUsingHeuristics();		NarrowSearchSpaceUsingHeuristics();

SmallVector<const Formula *, 8> Solution;		SmallVector<const Formula *, 8> Solution;
Solve(Solution);		Cost SolutionCost;
		Solve(Solution, SolutionCost);

// Release memory that is no longer needed.		// Release memory that is no longer needed.
Factors.clear();		Factors.clear();
Types.clear();		Types.clear();
RegUses.clear();		RegUses.clear();

if (Solution.empty())		if (Solution.empty())
return;		return;

		if (EnableInitalCostCheck &&
		IsInputCostCheaperThanSolutionCost(SolutionCost)) {
		++NumCheaperInput;
		sebpopUnsubmitted Done Reply Inline Actions Do you have some statistics on how many times this currently happens on a benchmark of your choice? sebpop: Do you have some statistics on how many times this currently happens on a benchmark of your…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions For now I want to be conservative on skipping LSR even when the input cost is shown to be cheap, so I applied some weight on the input cost before comparing with the selected solution. Because of this weight, it doesn't seem to happen widely. In my test for spec2000, it impact only on 6 loops. junbuml: For now I want to be conservative on skipping LSR even when the input cost is shown to be cheap…
		LLVM_DEBUG(dbgs() << "Skip using LSR's solution.\n");
		return;
		}

#ifndef NDEBUG		#ifndef NDEBUG
// Formulae should be legal.		// Formulae should be legal.
for (const LSRUse &LU : Uses) {		for (const LSRUse &LU : Uses) {
for (const Formula &F : LU.Formulae)		for (const Formula &F : LU.Formulae)
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,		assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
F) && "Illegal formula generated!");		F) && "Illegal formula generated!");
};		};
#endif		#endif
▲ Show 20 Lines • Show All 164 Lines • Show Last 20 Lines

test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll

	; RUN: opt -loop-reduce -S < %s | FileCheck %s			; RUN: opt -loop-reduce -S -enable-lsr-input-cost-check=false < %s | FileCheck %s
	;			;
	; LTO of clang, which mistakenly uses no TargetLoweringInfo, causes a			; LTO of clang, which mistakenly uses no TargetLoweringInfo, causes a
	; miscompile. ReuseOrCreateCast replace ptrtoint operand with undef.			; miscompile. ReuseOrCreateCast replace ptrtoint operand with undef.
	; Reproducing the miscompile requires no triple, hence no "TTI".			; Reproducing the miscompile requires no triple, hence no "TTI".
	; rdar://13007381			; rdar://13007381

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

	▲ Show 20 Lines • Show All 75 Lines • Show Last 20 Lines

test/Transforms/LoopStrengthReduce/AArch64/skip-lsr-solution.ll

This file was added.

				; REQUIRES: asserts
				; RUN: llc < %s -mtriple=aarch64 -lsr-insns-cost=true -debug-only=loop-reduce 2>&1 | FileCheck %s
				kparzyszUnsubmitted Done Reply Inline Actions This needs "REQUIRES: asserts". kparzysz: This needs "REQUIRES: asserts".
				sebpopUnsubmitted Done Reply Inline Actions I don't see any CHECK statement depending on -debug-only, so instead of requiring asserts, let's just remove that flag. Also please remove the other flag: -lsr-insns-cost=true as I see that its default value is true: "lsr-insns-cost", cl::Hidden, cl::init(true), sebpop: I don't see any CHECK statement depending on -debug-only, so instead of requiring asserts…
				kparzyszUnsubmitted Done Reply Inline Actions It's there, the first CHECK line: `CHECK: Skip using LSR's solution`. kparzysz: It's there, the first CHECK line: `CHECK: Skip using LSR's solution`.
				junbumlAuthorUnsubmitted Not Done Reply Inline Actions In order to force to use #instruction in the cost model for this test, we need to have -lsr-insns-cost=true specifically in the command-line because the occurrence of lsr-insns-cost is checked in Cost::isLess(). junbuml: In order to force to use #instruction in the cost model for this test, we need to have -lsr…

				; ModuleID = 'loophmmer.c'
				source_filename = "loophmmer.c"
				target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
				target triple = "aarch64-dcg-linux-gnu"

				;void test(int M, int *P, int *P2, int *P3, int *P4,
				; int *P5, int *P6, int *P7, int *P8) {
				;
				; int t = 0;
				; for (int k = 1; k <= M; ++k) {
				; P[k-1] = P[k-1] + P2[k-1];
				; P[k-1] = P3[k-1] + P4[k-1];
				; P[k-1] = P6[k-1] + P7[k-1];
				;
				; if (k < M) {
				; P6[k] = P8[k] + P[k];
				; P[k] = P4[k] + P5[k];
				; }
				; }
				;}


				; CHECK: Skip using LSR's solution
				; CHECK-LABEL: test:
				; CHECK: lsl [[K_IDX:x[0-9]+]], [[K:x[0-9]+]], #2
				; CHECK: sub [[K_1_IDX:x[0-9]+]], [[K_IDX]], #4
				; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, [[K_1_IDX]]
				; CHECK: add [[K]], [[K]], #1

				; Test input: a single loop whose i64 induction variable starts at 1 and is
				; used, as %iv - 1 and as %iv, to index several i32 arrays.
				define void @test(i32 %M, i32* %P, i32* readonly %P2, i32* readonly %P3, i32* readonly %P4, i32* readonly %P5, i32* %P6, i32* readonly %P7, i32* readonly %P8){
				entry:
				; Skip the loop entirely when %M < 1.
				%cmp69 = icmp slt i32 %M, 1
				br i1 %cmp69, label %for.cond.cleanup, label %for.body.preheader

				for.body.preheader: ; preds = %entry
				; %0 is the bound tested inside the loop body; %wide.trip.count (%M + 1,
				; zero-extended) is the value the exit compare tests against.
				%0 = sext i32 %M to i64
				%1 = add i32 %M, 1
				%wide.trip.count = zext i32 %1 to i64
				br label %for.body

				for.cond.cleanup: ; preds = %for.inc, %entry
				ret void

				for.body: ; preds = %for.inc, %for.body.preheader
				%indvars.iv = phi i64 [ 1, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
				; %2 = %indvars.iv - 1 indexes every access in this block.
				%2 = add nsw i64 %indvars.iv, -1
				%arrayidx = getelementptr inbounds i32, i32* %P, i64 %2
				%3 = load i32, i32* %arrayidx, align 4
				%arrayidx3 = getelementptr inbounds i32, i32* %P2, i64 %2
				%4 = load i32, i32* %arrayidx3, align 4
				%add = add nsw i32 %4, %3
				store i32 %add, i32* %arrayidx, align 4
				%arrayidx9 = getelementptr inbounds i32, i32* %P3, i64 %2
				%5 = load i32, i32* %arrayidx9, align 4
				%arrayidx12 = getelementptr inbounds i32, i32* %P4, i64 %2
				%6 = load i32, i32* %arrayidx12, align 4
				%add13 = add nsw i32 %6, %5
				store i32 %add13, i32* %arrayidx, align 4
				%arrayidx19 = getelementptr inbounds i32, i32* %P6, i64 %2
				%7 = load i32, i32* %arrayidx19, align 4
				%arrayidx22 = getelementptr inbounds i32, i32* %P7, i64 %2
				%8 = load i32, i32* %arrayidx22, align 4
				%add23 = add nsw i32 %8, %7
				store i32 %add23, i32* %arrayidx, align 4
				; Run the conditional part only while %indvars.iv < sext(%M).
				%cmp27 = icmp slt i64 %indvars.iv, %0
				br i1 %cmp27, label %if.then, label %for.inc

				if.then: ; preds = %for.body
				; Accesses in this block are indexed by %indvars.iv itself.
				%arrayidx29 = getelementptr inbounds i32, i32* %P8, i64 %indvars.iv
				%9 = load i32, i32* %arrayidx29, align 4
				%arrayidx31 = getelementptr inbounds i32, i32* %P, i64 %indvars.iv
				%10 = load i32, i32* %arrayidx31, align 4
				%add32 = add nsw i32 %10, %9
				%arrayidx34 = getelementptr inbounds i32, i32* %P6, i64 %indvars.iv
				store i32 %add32, i32* %arrayidx34, align 4
				%arrayidx36 = getelementptr inbounds i32, i32* %P4, i64 %indvars.iv
				%11 = load i32, i32* %arrayidx36, align 4
				%arrayidx38 = getelementptr inbounds i32, i32* %P5, i64 %indvars.iv
				%12 = load i32, i32* %arrayidx38, align 4
				%add39 = add nsw i32 %12, %11
				store i32 %add39, i32* %arrayidx31, align 4
				br label %for.inc

				for.inc: ; preds = %for.body, %if.then
				; Step the induction variable by one and exit when it reaches
				; %wide.trip.count.
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}