Diff 21798

lib/Transforms/Scalar/LoopUnrollPass.cpp

Show First 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	public:

// Select threshold values used to limit unrolling based on a		// Select threshold values used to limit unrolling based on a
// total unrolled size. Parameters Threshold and PartialThreshold		// total unrolled size. Parameters Threshold and PartialThreshold
// are set to the maximum unrolled size for fully and partially		// are set to the maximum unrolled size for fully and partially
// unrolled loops respectively.		// unrolled loops respectively.
void selectThresholds(const Loop *L, bool HasPragma,		void selectThresholds(const Loop *L, bool HasPragma,
const TargetTransformInfo::UnrollingPreferences &UP,		const TargetTransformInfo::UnrollingPreferences &UP,
unsigned &Threshold, unsigned &PartialThreshold,		unsigned &Threshold, unsigned &PartialThreshold,
unsigned NumberOfOptimizedInstructions) {		unsigned &AbsoluteThreshold,
		unsigned &PercentOfOptimizedForCompleteUnroll) {
// Determine the current unrolling threshold. While this is		// Determine the current unrolling threshold. While this is
// normally set from UnrollThreshold, it is overridden to a		// normally set from UnrollThreshold, it is overridden to a
// smaller value if the current function is marked as		// smaller value if the current function is marked as
// optimize-for-size, and the unroll threshold was not user		// optimize-for-size, and the unroll threshold was not user
// specified.		// specified.
Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;		Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
		PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
// If we are allowed to completely unroll if we can remove M% of		AbsoluteThreshold = UserAbsoluteThreshold ? CurrentAbsoluteThreshold
// instructions, and we know that with complete unrolling we'll be able
// to kill N instructions, then we can afford to completely unroll loops
// with unrolled size up to N*100/M.
// Adjust the threshold according to that:
unsigned PercentOfOptimizedForCompleteUnroll =
UserPercentOfOptimized ? CurrentMinPercentOfOptimized
: UP.MinPercentOfOptimized;
unsigned AbsoluteThreshold = UserAbsoluteThreshold
? CurrentAbsoluteThreshold
: UP.AbsoluteThreshold;		: UP.AbsoluteThreshold;
if (PercentOfOptimizedForCompleteUnroll)		PercentOfOptimizedForCompleteUnroll = UserPercentOfOptimized
Threshold = std::max<unsigned>(Threshold,		? CurrentMinPercentOfOptimized
NumberOfOptimizedInstructions * 100 /		: UP.MinPercentOfOptimized;
PercentOfOptimizedForCompleteUnroll);
// But don't allow unrolling loops bigger than absolute threshold.
Threshold = std::min<unsigned>(Threshold, AbsoluteThreshold);

PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
if (!UserThreshold &&		if (!UserThreshold &&
L->getHeader()->getParent()->hasFnAttribute(		L->getHeader()->getParent()->hasFnAttribute(
Attribute::OptimizeForSize)) {		Attribute::OptimizeForSize)) {
Threshold = UP.OptSizeThreshold;		Threshold = UP.OptSizeThreshold;
PartialThreshold = UP.PartialOptSizeThreshold;		PartialThreshold = UP.PartialOptSizeThreshold;
}		}
if (HasPragma) {		if (HasPragma) {
// If the loop has an unrolling pragma, we want to be more		// If the loop has an unrolling pragma, we want to be more
// aggressive with unrolling limits. Set thresholds to at		// aggressive with unrolling limits. Set thresholds to at
// least the PragmaUnrollThreshold value which is larger than the		// least the PragmaUnrollThreshold value which is larger than the
// default limits.		// default limits.
if (Threshold != NoThreshold)		if (Threshold != NoThreshold)
Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold);		Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold);
if (PartialThreshold != NoThreshold)		if (PartialThreshold != NoThreshold)
PartialThreshold =		PartialThreshold =
std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold);		std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold);
}		}
}		}
		bool canUnrollCompletely(Loop *L, unsigned Threshold,
		unsigned AbsoluteThreshold, unsigned UnrolledSize,
		unsigned NumberOfOptimizedInstructions,
		unsigned PercentOfOptimizedForCompleteUnroll);
};		};
}		}

char LoopUnroll::ID = 0;		char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)		INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)		INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)		INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)		INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)		INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)		INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)		INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)		INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)

Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,		Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
int Runtime) {		int Runtime) {
return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);		return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
}		}

Pass *llvm::createSimpleLoopUnrollPass() {		Pass *llvm::createSimpleLoopUnrollPass() {
return llvm::createLoopUnrollPass(-1, -1, 0, 0);		return llvm::createLoopUnrollPass(-1, -1, 0, 0);
}		}

static bool isLoadFromConstantInitializer(Value *V) {		/// \brief SCEV expressions visitor used for finding expressions that would
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))		/// become constants if the loop L is unrolled.
if (GV->isConstant() && GV->hasDefinitiveInitializer())
return GV->getInitializer();
return false;
}

struct FindConstantPointers {		struct FindConstantPointers {
bool LoadCanBeConstantFolded;		/// \brief Shows whether the expression is ConstAddress+Constant or not.
bool IndexIsConstant;		bool IndexIsConstant;
APInt Step;
APInt StartValue;		/// \brief Used for filtering out SCEV expressions with two or more AddRec
		/// subexpressions.
		///
		/// haveSeenAR is used to filter out complicated SCEV expressions, having
		/// several AddRec sub-expressions. We don't handle them, because unrolling one
		/// loop wouldn't help to replace only one of these inductions with a constant,
		/// and consequently, the expression would remain non-constant.
		bool haveSeenAR;

		/// \brief If the SCEV expression becomes ConstAddress+Constant, this value
		/// holds ConstAddress. Otherwise, it's nullptr.
Value *BaseAddress;		Value *BaseAddress;

		/// \brief The loop, which we try to completely unroll.
const Loop *L;		const Loop *L;

ScalarEvolution &SE;		ScalarEvolution &SE;

FindConstantPointers(const Loop *loop, ScalarEvolution &SE)		FindConstantPointers(const Loop *loop, ScalarEvolution &SE)
: LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {}		: IndexIsConstant(true), haveSeenAR(false), BaseAddress(nullptr),
		L(loop), SE(SE) {}

bool follow(const SCEV *S) {		bool follow(const SCEV *S) {
if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {		if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
// We've reached the leaf node of SCEV, it's most probably just a		// We've reached the leaf node of SCEV, it's most probably just a
// variable. Now it's time to see if it corresponds to a global constant		// variable.
// global (in which case we can eliminate the load), or not.		// If it's the only one SCEV-subexpression, then it might be a base
		// address of an index expression.
		// If we've already recorded base address, then just give up on this SCEV
		// - it's too complicated.
		if (BaseAddress)
		return IndexIsConstant = false;
BaseAddress = SC->getValue();		BaseAddress = SC->getValue();
LoadCanBeConstantFolded =
IndexIsConstant && isLoadFromConstantInitializer(BaseAddress);
return false;		return false;
}		}
if (isa<SCEVConstant>(S))		if (isa<SCEVConstant>(S))
return true;		return true;
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {		if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// If the current SCEV expression is AddRec, and its loop isn't the loop		// If the current SCEV expression is AddRec, and its loop isn't the loop
// we are about to unroll, then we won't get a constant address after		// we are about to unroll, then we won't get a constant address after
// unrolling, and thus, won't be able to eliminate the load.		// unrolling, and thus, won't be able to eliminate the load.
if (AR->getLoop() != L)		if (AR->getLoop() != L)
return IndexIsConstant = false;		return IndexIsConstant = false;
// If the step isn't constant, we won't get constant addresses in unrolled
// version. Bail out.		// We don't handle multiple AddRecs here, so give up in this case.
if (const SCEVConstant *StepSE =		if (haveSeenAR)
dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
Step = StepSE->getValue()->getValue();
else
return IndexIsConstant = false;		return IndexIsConstant = false;

		haveSeenAR = true;

return IndexIsConstant;		return IndexIsConstant;
}		}
// If Result is true, continue traversal.		// If Result is true, continue traversal.
// Otherwise, we have found something that prevents us from (possible) load		// Otherwise, we have found something that prevents us from (possible) load
// elimination.		// elimination.
return IndexIsConstant;		return IndexIsConstant;
}		}
bool isDone() const { return !IndexIsConstant; }		bool isDone() const { return !IndexIsConstant; }
Show All 11 Lines
// v += b[i]*a[i];		// v += b[i]*a[i];
// If we completely unroll the loop, we would get:		// If we completely unroll the loop, we would get:
// v = b[0]a[0] + b[1]a[1] + b[2]*a[2]		// v = b[0]a[0] + b[1]a[1] + b[2]*a[2]
// Which then will be simplified to:		// Which then will be simplified to:
// v = b[0]* 0 + b[1]* 1 + b[2]* 0		// v = b[0]* 0 + b[1]* 1 + b[2]* 0
// And finally:		// And finally:
// v = b[1]		// v = b[1]
class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {		class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
		typedef SetVector<BasicBlock , SmallVector<BasicBlock , 16>,
		SmallPtrSet<BasicBlock *, 16>> BBSetVector;

typedef InstVisitor<UnrollAnalyzer, bool> Base;		typedef InstVisitor<UnrollAnalyzer, bool> Base;
friend class InstVisitor<UnrollAnalyzer, bool>;		friend class InstVisitor<UnrollAnalyzer, bool>;

		typedef struct {
		Value *BaseAddr;
		APInt Start;
		APInt Step;
		} SCEVGEPDescriptor;

		/// \brief The loop we're going to analyze.
const Loop *L;		const Loop *L;

		/// \brief TripCount of the given loop.
unsigned TripCount;		unsigned TripCount;

ScalarEvolution &SE;		ScalarEvolution &SE;

const TargetTransformInfo &TTI;		const TargetTransformInfo &TTI;

		// While we walk the loop instructions, we build up and maintain a mapping
		// of simplified values specific to this iteration. The idea is to propagate
		// any special information we have about loads that can be replaced with
		// constants after complete unrolling, and account for likely simplifications
		// post-unrolling.
DenseMap<Value , Constant > SimplifiedValues;		DenseMap<Value , Constant > SimplifiedValues;
DenseMap<LoadInst , Value > LoadBaseAddresses;
SmallPtrSet<Instruction *, 32> CountedInstructions;

/// \brief Count the number of optimized instructions.		// Similarly, we keep track of all instructions that become dead.
unsigned NumberOfOptimizedInstructions;		// We don't need to map them to a value, that's why we use Set instead of Map
		// here.
		SmallPtrSet<Instruction *, 16> DeadInstructions;

		// To avoid requesting SCEV info on every iteration, request it once, and
		// for each value that would become ConstAddress+Constant after loop
		// unrolling, save the corresponding data.
		DenseMap<Value *, SCEVGEPDescriptor> SCEVCache;

		/// \brief Number of currently simulated iteration.
		///
		/// If an expression is ConstAddress+Constant, then the Constant is
		/// Start + Iteration*Step, where Start and Step could be obtained from
		/// SCEVCache.
		unsigned Iteration;

		/// \brief Upper threshold for complete unrolling.
		unsigned MaxUnrolledLoopSize;

// Provide base case for our instruction visit.		// Provide base case for our instruction visit.
bool visitInstruction(Instruction &I) { return false; };		bool visitInstruction(Instruction &I) { return false; };
// TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt,		// TODO: Add visitors for other instruction types, e.g. ZExt, SExt.
// FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select,
// ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue.
//		//
// It is probably worth hoisting the code that estimates simplification		// It is probably worth hoisting the code that estimates simplification
// effects into a separate class, since we have very similar code in		// effects into a separate class, since we have very similar code in
// InlineCost already.		// InlineCost already.
bool visitBinaryOperator(BinaryOperator &I) {		bool visitBinaryOperator(BinaryOperator &I) {
Value LHS = I.getOperand(0), RHS = I.getOperand(1);		Value LHS = I.getOperand(0), RHS = I.getOperand(1);
if (!isa<Constant>(LHS))		if (!isa<Constant>(LHS))
if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))		if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
LHS = SimpleLHS;		LHS = SimpleLHS;
if (!isa<Constant>(RHS))		if (!isa<Constant>(RHS))
if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))		if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
RHS = SimpleRHS;		RHS = SimpleRHS;
Value *SimpleV = nullptr;		Value *SimpleV = nullptr;
if (auto FI = dyn_cast<FPMathOperator>(&I))		if (auto FI = dyn_cast<FPMathOperator>(&I))
SimpleV =		SimpleV =
SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());		SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
else		else
SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);		SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);

if (SimpleV && CountedInstructions.insert(&I).second)		if (SimpleV)
NumberOfOptimizedInstructions += TTI.getUserCost(&I);		NumberOfOptimizedInstructions += TTI.getUserCost(&I);

if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {		if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
SimplifiedValues[&I] = C;		SimplifiedValues[&I] = C;
return true;		return true;
}		}
return false;		return false;
}		}

Constant computeLoadValue(LoadInst LI, unsigned Iteration) {		bool visitCmpInst(CmpInst &I) {
if (!LI)		Value LHS = I.getOperand(0), RHS = I.getOperand(1);
return nullptr;		// First try to handle simplified comparisons.
Value *BaseAddr = LoadBaseAddresses[LI];		if (!isa<Constant>(LHS))
if (!BaseAddr)		if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
return nullptr;		LHS = SimpleLHS;
		if (!isa<Constant>(RHS))
		if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
		RHS = SimpleRHS;
		if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
		if (Constant *CRHS = dyn_cast<Constant>(RHS))
		if (Constant *C =
		ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
		NumberOfOptimizedInstructions += TTI.getUserCost(&I);
		SimplifiedValues[&I] = C;
		return true;
		}
		}

		if (I.getOpcode() == Instruction::FCmp)
		return false;

		return false;
		}

		bool visitLoad(LoadInst &I) {
		Value *AddrOp = I.getPointerOperand();
		if (SimplifiedValues[AddrOp])
		AddrOp = SimplifiedValues[AddrOp];
		if (!SCEVCache.count(AddrOp))
		return false;
		SCEVGEPDescriptor d = SCEVCache[AddrOp];

auto GV = dyn_cast<GlobalVariable>(BaseAddr);		auto GV = dyn_cast_or_null<GlobalVariable>(d.BaseAddr);
if (!GV)		if (!GV)
return nullptr;		return false;

ConstantDataSequential *CDS =		ConstantDataSequential *CDS =
dyn_cast<ConstantDataSequential>(GV->getInitializer());		dyn_cast<ConstantDataSequential>(GV->getInitializer());
if (!CDS)		if (!CDS)
return nullptr;		return false;

const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr);
const SCEV *S = SE.getSCEV(LI->getPointerOperand());
const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);

APInt StepC, StartC;
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
if (!AR)
return nullptr;

if (const SCEVConstant *StepSE =		unsigned Start = d.Start.getLimitedValue();
dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))		unsigned Step = d.Step.getLimitedValue();
StepC = StepSE->getValue()->getValue();
else
return nullptr;

if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()))
StartC = StartSE->getValue()->getValue();
else
return nullptr;

unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;		unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
unsigned Start = StartC.getLimitedValue();
unsigned Step = StepC.getLimitedValue();

unsigned Index = (Start + Step * Iteration) / ElemSize;		unsigned Index = (Start + Step * Iteration) / ElemSize;
if (Index >= CDS->getNumElements())		if (Index >= CDS->getNumElements())
return nullptr;		return false;

Constant *CV = CDS->getElementAsConstant(Index);		Constant *CV = CDS->getElementAsConstant(Index);
		if (CV)
		SimplifiedValues[cast<Value>(&I)] = CV;

return CV;		NumberOfOptimizedInstructions += TTI.getUserCost(&I);
		return true;
}		}

public:		// Check if this GEPs after complete loop unrolling would become a constant,
UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,		// or BaseAddress+Constant. If yes, save this BaseAddress and StartValue with
const TargetTransformInfo &TTI)		// StepValue for this GEP in the SCEVCache - that'll allow to evaluate the
: L(L), TripCount(TripCount), SE(SE), TTI(TTI),		// constant later.
NumberOfOptimizedInstructions(0) {}		// SCEV expression of such GEP should contain at most one AddRec expression,
		// and the loop corresponding to this expression should be L. The remaining
		// SCEV sub-expressions should be either constants or SCEVUnknown (which would
		// become the base address). If the expression contains the base address,
		// then after subtracting it, we should get AddRec with constant step and
		// start.
		bool visitGetElementPtr(GetElementPtrInst &I) {
		Value *V = cast<Value>(&I);
		if (SCEVCache.count(V))
		return false;

// Visit all loads the loop L, and for those that, after complete loop		const SCEV *S = SE.getSCEV(V);
// unrolling, would have a constant address and it will point to a known
// constant initializer, record its base address for future use. It is used
// when we estimate number of potentially simplified instructions.
void findConstFoldableLoads() {
for (auto BB : L->getBlocks()) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isSimple())
continue;
Value *AddrOp = LI->getPointerOperand();
const SCEV *S = SE.getSCEV(AddrOp);
FindConstantPointers Visitor(L, SE);		FindConstantPointers Visitor(L, SE);
SCEVTraversal<FindConstantPointers> T(Visitor);		SCEVTraversal<FindConstantPointers> T(Visitor);
		// Try to find (BaseAddress+Step+Offset) tuple.
		// If succeeded, save it to the cache - it might help in folding
		// loads.
T.visitAll(S);		T.visitAll(S);
if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) {		if (!Visitor.IndexIsConstant \|\| !Visitor.BaseAddress)
LoadBaseAddresses[LI] = Visitor.BaseAddress;		return false;
}
}		SCEVGEPDescriptor d;
}		d.BaseAddr = Visitor.BaseAddress;
}		const SCEV *BaseAddrSE = SE.getSCEV(d.BaseAddr);
		const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
		const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);

		if (!AR)
		return false;

		const SCEVConstant *StepSE =
		dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE));
		const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart());
		if (!StepSE \|\| !StartSE)
		return false;

		d.Step = StepSE->getValue()->getValue();
		d.Start = StartSE->getValue()->getValue();

		SCEVCache[V] = d;
		return false;
}		}

// Given a list of loads that could be constant-folded (LoadBaseAddresses),		public:
// estimate number of optimized instructions after substituting the concrete		UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
// values for the given Iteration. Also track how many instructions become		const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize)
// dead through this process.		: L(L), TripCount(TripCount), SE(SE), TTI(TTI),
unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {		MaxUnrolledLoopSize(MaxUnrolledLoopSize) {}
// We keep a set vector for the worklist so that we don't waste space in the
// worklist queuing up the same instruction repeatedly. This can happen due		/// \brief Count the number of optimized instructions.
// to multiple operands being the same instruction or due to the same		unsigned NumberOfOptimizedInstructions;
// instruction being an operand of lots of things that end up dead or
		chandlercUnsubmitted Not Done Reply Inline Actions Just insert and continue here as well? (again, see below for why this might be ok) chandlerc: Just insert and continue here as well? (again, see below for why this might be ok)
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Makes sense. It's a kind of leftover from inlining addBBSuccessors. mzolotukhin: Makes sense. It's a kind of leftover from inlining addBBSuccessors.
// simplified.		/// \brief Count the total number of instructions.
SmallSetVector<Instruction *, 8> Worklist;		unsigned UnrolledLoopSize;
		chandlercUnsubmitted Not Done Reply Inline Actions Handle this before we process the terminators? chandlerc: Handle this before we process the terminators?
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Makes sense. mzolotukhin: Makes sense.
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Actually, we can't do this before processing the terminators. It's possible that we find nothing in one block, but find a lot in others - that means we should only perform this check when all blocks are processed, i.e. at the end of an iteration processing. mzolotukhin: Actually, we can't do this before processing the terminators. It's possible that we find…

// Clear the simplified values and counts for this iteration.		// Complete loop unrolling can make some loads constant, and we need to know
		chandlercUnsubmitted Not Done Reply Inline Actions This comment seems out of date. chandlerc: This comment seems out of date.
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions It was just weirdly formatted:) mzolotukhin: It was just weirdly formatted:)
SimplifiedValues.clear();		// if that would expose any further optimization opportunities. This routine
CountedInstructions.clear();		// estimates this optimization. It assigns computed number of instructions,
		chandlercUnsubmitted Not Done Reply Inline Actions Rather than keeping a variable, just insert this onto the worklist and 'continue'? (see below for why this might be better) chandlerc: Rather than keeping a variable, just insert this onto the worklist and 'continue'? (see below…
		// that potentially might be optimized away, to NumberOfOptimizedInstructions,
		chandlercUnsubmitted Not Done Reply Inline Actions There is essentially no documentation of "EarlyExitFound" or why this behavior is correct. Generally, I'm worried about all the 'break' and loop control flow here. I think this is going to be a bit easier to understand if our early-stop points just return. chandlerc: There is essentially no documentation of "EarlyExitFound" or why this behavior is correct.
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions I will add more comments regarding EarlyExitFound flag. The logic behind it is the following: when at some point we managed to resolve conditional branch in some block, and this branch leads us out of the loop, we don't want to analyze further iterations - if we unroll the loop, they will become a dead code and will be removed. However, we do want to finish estimation of the current iteration - that's why we don't just return (otherwise, we'll miss the cost of blocks, remaining in the worklist). mzolotukhin: I will add more comments regarding EarlyExitFound flag. The logic behind it is the following…
		// and total number of instructions to UnrolledLoopSize (not counting blocks
		// that won't be reached, if we were able to compute the condition).
		chandlercUnsubmitted Not Done Reply Inline Actions Rather than declaring this in the iteration loop, please declare it in simulateLoop and clear it on each iteration so that we re-use allocated storage. We're very likely to put roughly the same number of BBs on the worklist each trip through. chandlerc: Rather than declaring this in the iteration loop, please declare it in simulateLoop and clear…
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Good point, thanks. Fixed. mzolotukhin: Good point, thanks. Fixed.
		chandlercUnsubmitted Not Done Reply Inline Actions No need to handle this, just fall through to the successor loop. chandlerc: No need to handle this, just fall through to the successor loop.
		chandlercUnsubmitted Not Done Reply Inline Actions Ok, I see what you're trying to do here. You're trying to check for us definitely exiting the loop after N iterations due to branching to the exit block. But I don't think this code is correct at this point. While you know that if this basic block is dynamically executed on this iteration, the loop will be exited, you don't know that this block will be dynamically executed on this iteration. This only seems important to handle cases where simulating N iterations proves something about the trip count that SCEV can't prove, which on the whole seems like a bug in SCEV. Is it really worth trying to handle it here? If so, you'll want to check that this block is the only loop exiting block or otherwise ensure that it is dynamically executed on every iteration without fail. If you decide not to handle this, then that makes using 'continue' in my two comments above make sense. chandlerc: Ok, I see what you're trying to do here. You're trying to check for us definitely exiting the…
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Oh, you are right. I incorrectly assumed that all blocks from worklist would be actually executed. I think I'll just drop this then to keep the code simpler. mzolotukhin: Oh, you are right. I incorrectly assumed that all blocks from worklist would be actually…
		void analyzeLoop() {
		BBSetVector BBWorklist;
		UnrolledLoopSize = 0;
NumberOfOptimizedInstructions = 0;		NumberOfOptimizedInstructions = 0;

// We start by adding all loads to the worklist.		// Don't simulate loops with a big or unknown tripcount
for (auto &LoadDescr : LoadBaseAddresses) {		if (!UnrollMaxIterationsCountToAnalyze \|\| !TripCount \|\|
LoadInst *LI = LoadDescr.first;		TripCount > UnrollMaxIterationsCountToAnalyze) {
SimplifiedValues[LI] = computeLoadValue(LI, Iteration);		UnrolledLoopSize = UINT_MAX;
if (CountedInstructions.insert(LI).second)		return;
NumberOfOptimizedInstructions += TTI.getUserCost(LI);		}
		chandlercUnsubmitted Not Done Reply Inline Actions Why not return? chandlerc: Why not return?
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Fixed, thanks. mzolotukhin: Fixed, thanks.

for (User *U : LI->users())		// Simulate execution of each iteration of the loop counting instructions,
Worklist.insert(cast<Instruction>(U));		// which would be simplified.
}		// Since the same load will take different values on different iterations,
		chandlercUnsubmitted Not Done Reply Inline Actions I think it would be more clear to inline the analyzeBlock and addBBSuccessors here... At the least, inlining addBBSuccessors avoids passing around the worklist. chandlerc: I think it would be more clear to inline the analyzeBlock and addBBSuccessors here... At the…
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Fixed, thanks. mzolotukhin: Fixed, thanks.
		// we literally have to go through all loop's iterations.
// And then we try to simplify every user of every instruction from the		for (Iteration = 0; Iteration < TripCount; ++Iteration) {
// worklist. If we do simplify a user, add it to the worklist to process		DeadInstructions.clear();
// its users as well.		BBWorklist.clear();
while (!Worklist.empty()) {		BBWorklist.insert(L->getHeader());
Instruction *I = Worklist.pop_back_val();		// Note that we must not cache the size, this loop grows the worklist.
if (!L->contains(I))		for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
		BasicBlock *BB = BBWorklist[Idx];
		if (BB->empty())
continue;		continue;
if (!visit(I))
		// Visit all instructions in the given basic block and try to simplify
		// it. We don't change the actual IR, just count optimization
		// opportunities.
		for (Instruction &I : *BB) {
		UnrolledLoopSize += TTI.getUserCost(&I);
		Base::visit(I);
		// If unrolled body turns out to be too big, bail out.
		if (UnrolledLoopSize - NumberOfOptimizedInstructions >
		MaxUnrolledLoopSize)
		return;
		}

		chandlercUnsubmitted Not Done Reply Inline Actions I find it a bit surprising to do this caching up-front. I would expect it to work better to lazily populate the cache based on the expressions we see. If that makes sense, I think it also makes sense to factor this into a completely separate change that just wraps up SCEV queries in a lazily populated cache, specifically for the purpose of simulation? Might make sense to fully sink this into SCEV, I don't know and would defer to others that know SCEV better. chandlerc: I find it a bit surprising to do this caching up-front. I would expect it to work better to…
		mzolotukhinAuthorUnsubmitted Not Done Reply Inline Actions Initially, I didn't want to mix working with SCEV with everything else, especially taking into account this data will be the same across all iterations and will be needed at every iteration. But now I'm thinking that moving this to visitGetElementPtr would simplify the code. I'll do that. As for the second part - SCEV is already a cache (i.e. Value --> SCEVExpression map ). But what we store in our cache is the result of some additional computations, which are very task-specific (we're looking for GEPs, whose SCEV expressions contain at most one SCEVAddRec and this AddRec relates to the particular loop). These computations seem expensive enough to cache their results, but they are not general enough to move that to SCEV itself. mzolotukhin: Initially, I didn't want to mix working with SCEV with everything else, especially taking into…
		// Add BB's successors to the worklist.
		// If it's possible to evaluate the condition, do that, and add only the
		// corresponding successor, dropping the rest.
		TerminatorInst *TI = BB->getTerminator();

		if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
		if (BI->isConditional()) {
		Value *Cond = BI->getCondition();
		if (ConstantInt *SimpleCond =
		dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
		BasicBlock *Succ = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
		if (L->contains(Succ))
		BBWorklist.insert(Succ);
continue;		continue;
for (User *U : I->users())		}
Worklist.insert(cast<Instruction>(U));		}
		} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
		Value *Cond = SI->getCondition();
		if (ConstantInt *SimpleCond =
		dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
		BasicBlock *Succ = SI->findCaseValue(SimpleCond).getCaseSuccessor();
		if (L->contains(Succ))
		BBWorklist.insert(Succ);
		continue;
		}
}		}

// Now that we know the potentially simplified instructions, estimate number		// We don't know which successor would be taken, so we need to add
// of instructions that would become dead if we do perform the		// all of them to the worklist.
// simplification.		for (BasicBlock *Succ : successors(BB))
		if (L->contains(Succ))
// The dead instructions are held in a separate set. This is used to		BBWorklist.insert(Succ);
// prevent us from re-examining instructions and make sure we only count		}
// the benefit once. The worklist's internal set handles insertion
// deduplication.		// If we found no optimization opportunities on the first iteration, we
SmallPtrSet<Instruction *, 16> DeadInstructions;		// won't find them on later ones too.
		if (!NumberOfOptimizedInstructions) {
// Lambda to enque operands onto the worklist.		UnrolledLoopSize = UINT_MAX;
auto EnqueueOperands = [&](Instruction &I) {		return;
for (auto *Op : I.operand_values())		}
if (auto *OpI = dyn_cast<Instruction>(Op))
if (!OpI->use_empty())		for (unsigned Idx = BBWorklist.size() - 1; Idx != 0; --Idx) {
Worklist.insert(OpI);		BasicBlock *BB = BBWorklist[Idx];
};		if (BB->empty())

// Start by initializing worklist with simplified instructions.
for (auto &FoldedKeyValue : SimplifiedValues)
if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
DeadInstructions.insert(FoldedInst);

// Add each instruction operand of this dead instruction to the
// worklist.
EnqueueOperands(*FoldedInst);
}

// If a definition of an insn is only used by simplified or dead
// instructions, it's also dead. Check defs of all instructions from the
// worklist.
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (!L->contains(I))
continue;		continue;
if (DeadInstructions.count(I))		for (BasicBlock::reverse_iterator I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
		if (SimplifiedValues.count(&*I))
		continue;
		if (DeadInstructions.count(&*I))
continue;		continue;

if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {		if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
return DeadInstructions.count(cast<Instruction>(U));		return SimplifiedValues.count(cast<Instruction>(U)) +
		DeadInstructions.count(cast<Instruction>(U));
})) {		})) {
NumberOfOptimizedInstructions += TTI.getUserCost(I);		NumberOfOptimizedInstructions += TTI.getUserCost(&*I);
DeadInstructions.insert(I);		DeadInstructions.insert(&*I);
EnqueueOperands(*I);		}
}		}
}		}
return NumberOfOptimizedInstructions;
}		}
};

// Complete loop unrolling can make some loads constant, and we need to know if
// that would expose any further optimization opportunities.
// This routine estimates this optimization effect and returns the number of
// instructions, that potentially might be optimized away.
static unsigned
approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
unsigned TripCount,
const TargetTransformInfo &TTI) {
if (!TripCount \|\| !UnrollMaxIterationsCountToAnalyze)
return 0;

UnrollAnalyzer UA(L, TripCount, SE, TTI);
UA.findConstFoldableLoads();

// Estimate number of instructions, that could be simplified if we replace a
// load with the corresponding constant. Since the same load will take
// different values on different iterations, we have to go through all loop's
// iterations here. To limit ourselves here, we check only first N
// iterations, and then scale the found number, if necessary.
unsigned IterationsNumberForEstimate =
std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
unsigned NumberOfOptimizedInstructions = 0;
for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
NumberOfOptimizedInstructions +=
UA.estimateNumberOfOptimizedInstructions(i);

NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;

return NumberOfOptimizedInstructions;		// If we can overflow computing percentage of optimized instructions, just
		// give a conservative answer. Anyway, we don't want to deal with such
		// big loops.
		if (NumberOfOptimizedInstructions > UINT_MAX / 100)
		NumberOfOptimizedInstructions = 0;
}		}
		};

/// ApproximateLoopSize - Approximate the size of the loop.		/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,		static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable,		bool &NotDuplicatable,
const TargetTransformInfo &TTI,		const TargetTransformInfo &TTI,
AssumptionCache *AC) {		AssumptionCache *AC) {
SmallPtrSet<const Value *, 32> EphValues;		SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, AC, EphValues);		CodeMetrics::collectEphemeralValues(L, AC, EphValues);
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	static void SetLoopAlreadyUnrolled(Loop *L) {
MDs.push_back(DisableNode);		MDs.push_back(DisableNode);

MDNode *NewLoopID = MDNode::get(Context, MDs);		MDNode *NewLoopID = MDNode::get(Context, MDs);
// Set operand 0 to refer to the loop id itself.		// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);		NewLoopID->replaceOperandWith(0, NewLoopID);
L->setLoopID(NewLoopID);		L->setLoopID(NewLoopID);
}		}

		/// Decide whether a loop may be completely unrolled.
		///
		/// Returns true when any of the following holds:
		///  - Threshold equals NoThreshold;
		///  - UnrolledSize does not exceed Threshold;
		///  - UnrolledSize does not exceed AbsoluteThreshold and the share of
		///    instructions expected to be optimized away
		///    (NumberOfOptimizedInstructions * 100 / UnrolledSize) is at least
		///    PercentOfOptimizedForCompleteUnroll.
		/// Otherwise the inputs are printed to the debug stream and false is
		/// returned. The parameter L does not take part in the decision.
		bool LoopUnroll::canUnrollCompletely(
		    Loop *L, unsigned Threshold, unsigned AbsoluteThreshold,
		    unsigned UnrolledSize, unsigned NumberOfOptimizedInstructions,
		    unsigned PercentOfOptimizedForCompleteUnroll) {

		  if (Threshold == NoThreshold) {
		    DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n");
		    return true;
		  }

		  if (UnrolledSize <= Threshold) {
		    DEBUG(dbgs() << " Can fully unroll, because unrolled size: "
		                 << UnrolledSize << "<" << Threshold << "\n");
		    return true;
		  }

		  // UnrolledSize > Threshold >= 0 here, so the division below cannot be a
		  // division by zero. The multiplication is done in a wider type so that a
		  // large NumberOfOptimizedInstructions cannot wrap around and yield a
		  // bogus percentage, regardless of whether the caller clamped the value.
		  const unsigned long long PercentOfOptimizedInstructions =
		      static_cast<unsigned long long>(NumberOfOptimizedInstructions) * 100 /
		      UnrolledSize;
		  if (UnrolledSize <= AbsoluteThreshold &&
		      PercentOfOptimizedInstructions >= PercentOfOptimizedForCompleteUnroll) {
		    DEBUG(dbgs() << " Can fully unroll, because unrolling will help removing "
		                 << PercentOfOptimizedInstructions
		                 << "% instructions (threshold: "
		                 << PercentOfOptimizedForCompleteUnroll << "%)\n");
		    DEBUG(dbgs() << " Unrolled size (" << UnrolledSize
		                 << ") is less than the threshold (" << AbsoluteThreshold
		                 << ").\n");
		    return true;
		  }

		  // None of the criteria matched: report every input that was considered.
		  DEBUG(dbgs() << " Too large to fully unroll:\n");
		  DEBUG(dbgs() << " Unrolled size: " << UnrolledSize << "\n");
		  DEBUG(dbgs() << " Estimated number of optimized instructions: "
		               << NumberOfOptimizedInstructions << "\n");
		  DEBUG(dbgs() << " Absolute threshold: " << AbsoluteThreshold << "\n");
		  DEBUG(dbgs() << " Minimum percent of removed instructions: "
		               << PercentOfOptimizedForCompleteUnroll << "\n");
		  DEBUG(dbgs() << " Threshold for small loops: " << Threshold << "\n");
		  return false;
		}

unsigned LoopUnroll::selectUnrollCount(		unsigned LoopUnroll::selectUnrollCount(
const Loop *L, unsigned TripCount, bool PragmaFullUnroll,		const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,		unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
bool &SetExplicitly) {		bool &SetExplicitly) {
SetExplicitly = true;		SetExplicitly = true;

// User-specified count (either as a command-line option or		// User-specified count (either as a command-line option or
// constructor parameter) has highest precedence.		// constructor parameter) has highest precedence.
▲ Show 20 Lines • Show All 90 Lines • ▼ Show 20 Lines	DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
<< " instructions.\n");		<< " instructions.\n");
return false;		return false;
}		}
if (NumInlineCandidates != 0) {		if (NumInlineCandidates != 0) {
DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");		DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return false;		return false;
}		}

unsigned NumberOfOptimizedInstructions =
approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI);
DEBUG(dbgs() << " Complete unrolling could save: "
<< NumberOfOptimizedInstructions << "\n");

unsigned Threshold, PartialThreshold;		unsigned Threshold, PartialThreshold;
		unsigned AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll;
selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold,		selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold,
NumberOfOptimizedInstructions);		AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll);

// Given Count, TripCount and thresholds determine the type of		// Given Count, TripCount and thresholds determine the type of
// unrolling which is to be performed.		// unrolling which is to be performed.
enum { Full = 0, Partial = 1, Runtime = 2 };		enum { Full = 0, Partial = 1, Runtime = 2 };
int Unrolling;		int Unrolling;
if (TripCount && Count == TripCount) {		if (TripCount && Count == TripCount) {
if (Threshold != NoThreshold && UnrolledSize > Threshold) {		UnrollAnalyzer UA(L, TripCount, *SE, TTI, AbsoluteThreshold);
DEBUG(dbgs() << " Too large to fully unroll with count: " << Count		UA.analyzeLoop();
<< " because size: " << UnrolledSize << ">" << Threshold		if (canUnrollCompletely(
<< "\n");		L, Threshold, AbsoluteThreshold,
Unrolling = Partial;		std::min<unsigned>(UnrolledSize, UA.UnrolledLoopSize),
} else {		UA.NumberOfOptimizedInstructions,
		PercentOfOptimizedForCompleteUnroll)) {
Unrolling = Full;		Unrolling = Full;
		} else {
		Unrolling = Partial;
}		}
} else if (TripCount && Count < TripCount) {		} else if (TripCount && Count < TripCount) {
Unrolling = Partial;		Unrolling = Partial;
} else {		} else {
Unrolling = Runtime;		Unrolling = Runtime;
}		}

// Reduce count based on the type of unrolling and the threshold values.		// Reduce count based on the type of unrolling and the threshold values.
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines

test/Transforms/LoopUnroll/full-unroll-no-taken-branch.ll

This file was added.

				; In this test we check that the loop was unrolled.
				; This loop contains a branch that makes the loop bigger than we can afford
				; to completely unroll. However, if we do unroll this loop, we can resolve
				; this branch and figure out that in fact we never take it. Thus, we can
				; ignore its cost.

				; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=20 -unroll-absolute-threshold=150 -unroll-threshold=150 -unroll-percent-of-optimized-for-complete-unroll=10 \| FileCheck %s
				; CHECK-NOT: icmp
				target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

				@a = private unnamed_addr constant [20 x i32] [i32 12, i32 10, i32 32, i32 201, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 10, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9]

				; Function Attrs: nounwind ssp uwtable
				; @foo iterates %indvars.iv from 0 to 19. On each iteration it loads
				; element %indvars.iv of the constant array @a and, only if that element
				; is zero, stores b[i] * c[i] into c[i].
				define void @foo(i32* %b, i32* %c) {
				entry:
				br label %for.body

				; Loop header: load a[i] and branch on whether it equals zero.
				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.continue ]
				%arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* @a, i64 0, i64 %indvars.iv
				%a_i = load i32, i32* %arrayidx, align 4
				%cmp_expensive_branch = icmp eq i32 %a_i, 0
				br i1 %cmp_expensive_branch, label %expensive.branch, label %for.continue

				; Reached only when a[i] == 0. No initializer of @a is zero, so once the
				; load is known per iteration this block is never entered.
				expensive.branch:
				%arrayidx_b = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
				%arrayidx_c = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
				%b_i = load i32, i32* %arrayidx_b, align 4
				%c_i = load i32, i32* %arrayidx_c, align 4
				%mul = mul nsw i32 %b_i, %c_i
				store i32 %mul, i32* %arrayidx_c, align 4
				br label %for.continue

				; Latch: advance the induction variable and leave the loop once it
				; reaches 20.
				for.continue:
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp ne i64 %indvars.iv.next, 20
				br i1 %exitcond, label %for.body, label %for.end

				for.end:
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

Reimplement heuristic for estimating complete-unroll optimization effects.
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 21798

lib/Transforms/Scalar/LoopUnrollPass.cpp

test/Transforms/LoopUnroll/full-unroll-no-taken-branch.ll

This is an archive of the discontinued LLVM Phabricator instance.

Reimplement heuristic for estimating complete-unroll optimization effects.
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 21798

lib/Transforms/Scalar/LoopUnrollPass.cpp

test/Transforms/LoopUnroll/full-unroll-no-taken-branch.ll

Reimplement heuristic for estimating complete-unroll optimization effects.
AbandonedPublic