This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
include/llvm/Analysis/
-
llvm/
-
Analysis/
-
LoopPassManager.h
-
lib/
-
Passes/
-
PassBuilder.cpp
-
Transforms/Scalar/
-
Scalar/
2/4
LoopUnrollPass.cpp
-
test/
-
Other/
2
pass-pipelines.ll
-
Transforms/LoopUnroll/
-
LoopUnroll/
-
unloop.ll
2/2
unroll-heuristics-pgo.ll

Differential D26527

Use profile info to adjust loop unroll threshold.
ClosedPublic

Authored by danielcdh on Nov 10 2016, 2:37 PM.

Download Raw Diff

Details

Reviewers

mzolotukhin
davidxl

Commits

rG41d72a863260: Use profile info to adjust loop unroll threshold.
rL287186: Use profile info to adjust loop unroll threshold.

Summary

For a flat loop, even if it is hot, it is not a good idea to unroll at runtime, so we set a lower partial unroll threshold.
For a hot loop, we set a higher unroll threshold and allow expensive trip count computation to enable more aggressive unrolling.

Diff Detail

Build Status

Buildable 1172
Build 1172: arc lint + arc unit

Event Timeline

danielcdh updated this revision to Diff 77556.Nov 10 2016, 2:37 PM

danielcdh retitled this revision from to Use profile info to adjust loop unroll threshold..

danielcdh updated this object.

danielcdh added reviewers: davidxl, mzolotukhin.

danielcdh added a subscriber: llvm-commits.

Herald added subscribers: mehdi_amini, sanjoy. · View Herald TranscriptNov 10 2016, 2:37 PM

Hi,

Thanks for working on this! Please find some comments inline.

Michael

lib/Transforms/Scalar/LoopUnrollPass.cpp
765	This looks like a magic number to me. Can we use some parameter for it (or maybe separate thresholds for 'hot' and 'cold' loops)?
test/Other/pass-pipelines.ll
49–50	Hmm, is loop-unroll in a separate instance of loop pass manager now?

Update the patch to remove the dependency on BFI/PSI and only use the trip count to evaluate whether we want to unroll the loop.

Also steal Michael's getLoopEstimatedTripCount implementation.

lib/Transforms/Scalar/LoopUnrollPass.cpp
765	Logic removed from the patch
test/Other/pass-pipelines.ll
49–50	Removed dependency to BFI/PSI

The change looks good to me, thank you! I'm assuming you and Michael will figure out which version of getLoopEstimatedTripCount you want to use, other than that I have mostly nitpicky comments below.

BTW, do you have performance testing results for this patch? I'd expect some improvements in code-size and compile-time with these changes.

Michael

lib/Transforms/Scalar/LoopUnrollPass.cpp
760	Please add some comment here.
761–762	`if (auto ProfileTripCount = getLoopEstimatedTripCount(L))` ?
lib/Transforms/Utils/LoopUtils.cpp
1071 ↗	(On Diff #78260)	This version and the one from D25963 should eventually become the same, right?
1077–1078 ↗	(On Diff #78260)	Probably we also need to check that the latch is exiting (i.e. the branch is conditional).
test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
6	Please add `@` to the name.
7	Is it enough to just check presence of the prologue? Maybe explicitly check that we have several copies of some instruction?

This revision is now accepted and ready to land.Nov 16 2016, 2:24 PM

update

lib/Transforms/Utils/LoopUtils.cpp
1071 ↗	(On Diff #78260)	Yes, I stole the code from D25963 ;-)

The perf/size impact of this patch is small on speccpu, as flat loops are rare in most of the benchmarks.

spec/2006/fp/C++/444.namd 25.47 +0.30%
spec/2006/fp/C++/447.dealII 45.46 +0.23%
spec/2006/fp/C++/450.soplex 43.38 +0.58%
spec/2006/fp/C++/453.povray 37.88 -0.78%
spec/2006/fp/C/433.milc 23.75 -0.13%
spec/2006/fp/C/470.lbm 41.53 -0.09%
spec/2006/fp/C/482.sphinx3 48.97 -0.11%
spec/2006/int/C++/471.omnetpp 22.79 -0.22%
spec/2006/int/C++/473.astar 22.99 +0.17%
spec/2006/int/C++/483.xalancbmk 38.55 -0.42%
spec/2006/int/C/400.perlbench 37.06 +1.14%
spec/2006/int/C/401.bzip2 23.38 +0.98%
spec/2006/int/C/403.gcc 34.52 -0.25%
spec/2006/int/C/429.mcf 42.28 -0.05%
spec/2006/int/C/445.gobmk 27.98 +0.60%
spec/2006/int/C/456.hmmer 26.01 -0.06%
spec/2006/int/C/458.sjeng 30.42 +0.50%
spec/2006/int/C/462.libquantum 57.48 +0.37%
spec/2006/int/C/464.h264ref 47.6 -0.70%

geometric mean +0.11%

danielcdh closed this revision.Nov 16 2016, 5:26 PM

Revision Contents

Path

Size

include/

llvm/

Analysis/

LoopPassManager.h

3 lines

lib/

Passes/

PassBuilder.cpp

1 line

Transforms/

Scalar/

LoopUnrollPass.cpp

66 lines

test/

Other/

pass-pipelines.ll

2 lines

Transforms/

LoopUnroll/

unloop.ll

2 lines

unroll-heuristics-pgo.ll

142 lines

Diff 77556

include/llvm/Analysis/LoopPassManager.h

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	typedef InnerAnalysisManagerProxy<LoopAnalysisManager, Function>			typedef InnerAnalysisManagerProxy<LoopAnalysisManager, Function>
	LoopAnalysisManagerFunctionProxy;			LoopAnalysisManagerFunctionProxy;

	extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;			extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;
	/// A proxy from a \c FunctionAnalysisManager to a \c Loop.			/// A proxy from a \c FunctionAnalysisManager to a \c Loop.
	typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>			typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>
	FunctionAnalysisManagerLoopProxy;			FunctionAnalysisManagerLoopProxy;

				typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Loop>
				ModuleAnalysisManagerLoopProxy;

	/// Returns the minimum set of Analyses that all loop passes must preserve.			/// Returns the minimum set of Analyses that all loop passes must preserve.
	PreservedAnalyses getLoopPassPreservedAnalyses();			PreservedAnalyses getLoopPassPreservedAnalyses();

	/// \brief Adaptor that maps from a function to its loops.			/// \brief Adaptor that maps from a function to its loops.
	///			///
	/// Designed to allow composition of a LoopPass(Manager) and a			/// Designed to allow composition of a LoopPass(Manager) and a
	/// FunctionPassManager. Note that if this pass is constructed with a \c			/// FunctionPassManager. Note that if this pass is constructed with a \c
	/// FunctionAnalysisManager it will run the \c LoopAnalysisManagerFunctionProxy			/// FunctionAnalysisManager it will run the \c LoopAnalysisManagerFunctionProxy
	▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines

lib/Passes/PassBuilder.cpp

Show First 20 Lines • Show All 768 Lines • ▼ Show 20 Lines	void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });		MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); });		MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); });
CGAM.registerPass([&] { return FunctionAnalysisManagerCGSCCProxy(FAM); });		CGAM.registerPass([&] { return FunctionAnalysisManagerCGSCCProxy(FAM); });
CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); });		CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); });
FAM.registerPass([&] { return CGSCCAnalysisManagerFunctionProxy(CGAM); });		FAM.registerPass([&] { return CGSCCAnalysisManagerFunctionProxy(CGAM); });
FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });		FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });		FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });		LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
		LAM.registerPass([&] { return ModuleAnalysisManagerLoopProxy(MAM); });
}		}

bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,		bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
ArrayRef<PipelineElement> Pipeline,		ArrayRef<PipelineElement> Pipeline,
bool VerifyEachPass,		bool VerifyEachPass,
bool DebugLogging) {		bool DebugLogging) {
for (const auto &Element : Pipeline) {		for (const auto &Element : Pipeline) {
if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))		if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

lib/Transforms/Scalar/LoopUnrollPass.cpp

Show All 9 Lines
// This pass implements a simple loop unroller. It works best when loops have		// This pass implements a simple loop unroller. It works best when loops have
// been canonicalized by the -indvars pass, allowing it to determine the trip		// been canonicalized by the -indvars pass, allowing it to determine the trip
// counts of loops easily.		// counts of loops easily.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopUnrollPass.h"		#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"		#include "llvm/Analysis/AssumptionCache.h"
		#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"		#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/GlobalsModRef.h"		#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"		#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"		#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopPassManager.h"		#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/LoopUnrollAnalyzer.h"		#include "llvm/Analysis/LoopUnrollAnalyzer.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"		#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
		#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"		#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"		#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"		#include "llvm/IR/Metadata.h"
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	static cl::opt<unsigned> UnrollMaxUpperBound(
cl::desc(		cl::desc(
"The max of trip count upper bound that is considered in unrolling"));		"The max of trip count upper bound that is considered in unrolling"));

static cl::opt<unsigned> PragmaUnrollThreshold(		static cl::opt<unsigned> PragmaUnrollThreshold(
"pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,		"pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
cl::desc("Unrolled size limit for loops with an unroll(full) or "		cl::desc("Unrolled size limit for loops with an unroll(full) or "
"unroll_count pragma."));		"unroll_count pragma."));

		static cl::opt<unsigned> FlatLoopTripCountThreshold(
		"flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
		cl::desc("If the runtime tripcount for the loop is lower than the "
		"threshold, the loop is considered as flat and will be less "
		"aggressively unrolled."));

/// A magic value for use with the Threshold parameter to indicate		/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much		/// that the loop unroll should be performed regardless of how much
/// code expansion would result.		/// code expansion would result.
static const unsigned NoThreshold = UINT_MAX;		static const unsigned NoThreshold = UINT_MAX;

/// Gather the various unrolling parameters based on the defaults, compiler		/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.		/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(		static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
▲ Show 20 Lines • Show All 594 Lines • ▼ Show 20 Lines	static uint64_t getUnrolledLoopSize(
assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");		assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;		return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
}		}

// Returns true if unroll count was set explicitly.		// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.		// Calculates unroll count and writes it to UP.Count.
static bool computeUnrollCount(		static bool computeUnrollCount(
Loop L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo LI,		Loop L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo LI,
ScalarEvolution SE, OptimizationRemarkEmitter ORE, unsigned &TripCount,		ScalarEvolution SE, BlockFrequencyInfo BFI, ProfileSummaryInfo *PSI,
unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,		OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
		unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {		TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.		// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.		// 1st priority is unroll count set by "unroll-count" option.
bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;		bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
if (UserUnrollCount) {		if (UserUnrollCount) {
UP.Count = UnrollCount;		UP.Count = UnrollCount;
UP.AllowExpensiveTripCount = true;		UP.AllowExpensiveTripCount = true;
UP.Force = true;		UP.Force = true;
Show All 18 Lines	if (PragmaFullUnroll && TripCount != 0) {
if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)		if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return false;		return false;
}		}

bool PragmaEnableUnroll = HasUnrollEnablePragma(L);		bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
bool ExplicitUnroll = PragmaCount > 0 \|\| PragmaFullUnroll \|\|		bool ExplicitUnroll = PragmaCount > 0 \|\| PragmaFullUnroll \|\|
PragmaEnableUnroll \|\| UserUnrollCount;		PragmaEnableUnroll \|\| UserUnrollCount;

		if (L->getHeader()->getParent()->getEntryCount()) {
		mzolotukhinUnsubmitted Done Reply Inline Actions Please add some comment here. mzolotukhin: Please add some comment here.
		if (TripCount == 0 &&
		BFI->getBlockFreq(L->getHeader()) <
		mzolotukhinUnsubmitted Done Reply Inline Actions `if (auto ProfileTripCount = getLoopEstimatedTripCount(L))` ? mzolotukhin: `if (auto ProfileTripCount = getLoopEstimatedTripCount(L))` ?
		FlatLoopTripCountThreshold *
		BFI->getBlockFreq(L->getLoopPreheader()).getFrequency()) {
		UP.PartialThreshold /= 4;
		mzolotukhinUnsubmitted Not Done Reply Inline Actions This looks like a magic number to me. Can we use some parameter for it (or maybe separate thresholds for 'hot' and 'cold' loops)? mzolotukhin: This looks like a magic number to me. Can we use some parameter for it (or maybe separate…
		danielcdhAuthorUnsubmitted Not Done Reply Inline Actions Logic removed from the patch danielcdh: Logic removed from the patch
		} else if (PSI->isHotBB(L->getHeader(), BFI)) {
		UP.Threshold *= 4;
		UP.PartialThreshold *= 4;
		UP.AllowExpensiveTripCount = true;
		}
		}

if (ExplicitUnroll && TripCount != 0) {		if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with		// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value		// unrolling limits. Set thresholds to at least the PragmaThreshold value
// which is larger than the default limits.		// which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);		UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =		UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);		std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
}		}
▲ Show 20 Lines • Show All 159 Lines • ▼ Show 20 Lines	#endif
DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");		DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
if (UP.Count < 2)		if (UP.Count < 2)
UP.Count = 0;		UP.Count = 0;
return ExplicitUnroll;		return ExplicitUnroll;
}		}

static bool tryToUnrollLoop(Loop L, DominatorTree &DT, LoopInfo LI,		static bool tryToUnrollLoop(Loop L, DominatorTree &DT, LoopInfo LI,
ScalarEvolution *SE, const TargetTransformInfo &TTI,		ScalarEvolution *SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,		AssumptionCache &AC, BlockFrequencyInfo *BFI,
bool PreserveLCSSA,		ProfileSummaryInfo *PSI,
		OptimizationRemarkEmitter &ORE, bool PreserveLCSSA,
Optional<unsigned> ProvidedCount,		Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,		Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,		Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime,		Optional<bool> ProvidedRuntime,
Optional<bool> ProvidedUpperBound) {		Optional<bool> ProvidedUpperBound) {
DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()		DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
<< "] Loop %" << L->getHeader()->getName() << "\n");		<< "] Loop %" << L->getHeader()->getName() << "\n");
if (HasUnrollDisablePragma(L)) {		if (HasUnrollDisablePragma(L)) {
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	if (!TripCount) {
if (!(UP.UpperBound \|\| MaxOrZero) \|\| MaxTripCount > UnrollMaxUpperBound) {		if (!(UP.UpperBound \|\| MaxOrZero) \|\| MaxTripCount > UnrollMaxUpperBound) {
MaxTripCount = 0;		MaxTripCount = 0;
}		}
}		}

// computeUnrollCount() decides whether it is beneficial to use upper bound to		// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.		// fully unroll the loop.
bool UseUpperBound = false;		bool UseUpperBound = false;
bool IsCountSetExplicitly =		bool IsCountSetExplicitly = computeUnrollCount(
computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,		L, TTI, DT, LI, SE, BFI, PSI, &ORE, TripCount, MaxTripCount, TripMultiple,
TripMultiple, LoopSize, UP, UseUpperBound);		LoopSize, UP, UseUpperBound);
if (!UP.Count)		if (!UP.Count)
return false;		return false;
// Unroll factor (Count) must be less or equal to TripCount.		// Unroll factor (Count) must be less or equal to TripCount.
if (TripCount && UP.Count > TripCount)		if (TripCount && UP.Count > TripCount)
UP.Count = TripCount;		UP.Count = TripCount;

// Unroll the loop.		// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,		if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
Show All 32 Lines	bool runOnLoop(Loop *L, LPPassManager &) override {
if (skipLoop(L))		if (skipLoop(L))
return false;		return false;

Function &F = *L->getHeader()->getParent();		Function &F = *L->getHeader()->getParent();

auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();		auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();		LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();		ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
		BlockFrequencyInfo *BFI =
		&getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
		ProfileSummaryInfo *PSI =
		getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
const TargetTransformInfo &TTI =		const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);		getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);		auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis		// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations		// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).		// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(&F);		OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);		bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);

return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,		return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, BFI, PSI, ORE, PreserveLCSSA,
ProvidedCount, ProvidedThreshold,		ProvidedCount, ProvidedThreshold,
ProvidedAllowPartial, ProvidedRuntime,		ProvidedAllowPartial, ProvidedRuntime,
ProvidedUpperBound);		ProvidedUpperBound);
}		}

/// This transformation requires natural loop information & requires that		/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG...		/// loop preheaders be inserted into the CFG...
///		///
void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();		AU.addRequired<AssumptionCacheTracker>();
		AU.addRequired<BlockFrequencyInfoWrapperPass>();
		AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();		AU.addRequired<TargetTransformInfoWrapperPass>();
// FIXME: Loop passes are required to preserve domtree, and for now we just		// FIXME: Loop passes are required to preserve domtree, and for now we just
// recreate dom info if anything gets unrolled.		// recreate dom info if anything gets unrolled.
getLoopAnalysisUsage(AU);		getLoopAnalysisUsage(AU);
}		}
};		};
}		}

char LoopUnroll::ID = 0;		char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)		INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)		INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
		INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
		INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)		INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)		INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)		INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)

Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,		Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
int Runtime, int UpperBound) {		int Runtime, int UpperBound) {
// TODO: It would make more sense for this function to take the optionals		// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree		// directly, but that's dangerous since it would silently break out of tree
Show All 9 Lines
Pass *llvm::createSimpleLoopUnrollPass() {		Pass *llvm::createSimpleLoopUnrollPass() {
return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);		return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
}		}

PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) {		PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) {
const auto &FAM =		const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();		AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
Function *F = L.getHeader()->getParent();		Function *F = L.getHeader()->getParent();
		const auto &MAM =
		AM.getResult<ModuleAnalysisManagerLoopProxy>(L).getManager();

DominatorTree DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);		DominatorTree DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
LoopInfo LI = FAM.getCachedResult<LoopAnalysis>(F);		LoopInfo LI = FAM.getCachedResult<LoopAnalysis>(F);
ScalarEvolution SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(F);		ScalarEvolution SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(F);
auto TTI = FAM.getCachedResult<TargetIRAnalysis>(F);		auto TTI = FAM.getCachedResult<TargetIRAnalysis>(F);
auto AC = FAM.getCachedResult<AssumptionAnalysis>(F);		auto AC = FAM.getCachedResult<AssumptionAnalysis>(F);
		auto BFI = FAM.getCachedResult<BlockFrequencyAnalysis>(F);
		auto PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(F->getParent());
auto ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(F);		auto ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(F);
if (!DT)		if (!DT)
report_fatal_error(		report_fatal_error(
"LoopUnrollPass: DominatorTreeAnalysis not cached at a higher level");		"LoopUnrollPass: DominatorTreeAnalysis not cached at a higher level");
if (!LI)		if (!LI)
report_fatal_error(		report_fatal_error(
"LoopUnrollPass: LoopAnalysis not cached at a higher level");		"LoopUnrollPass: LoopAnalysis not cached at a higher level");
if (!SE)		if (!SE)
report_fatal_error(		report_fatal_error(
"LoopUnrollPass: ScalarEvolutionAnalysis not cached at a higher level");		"LoopUnrollPass: ScalarEvolutionAnalysis not cached at a higher level");
if (!TTI)		if (!TTI)
report_fatal_error(		report_fatal_error(
"LoopUnrollPass: TargetIRAnalysis not cached at a higher level");		"LoopUnrollPass: TargetIRAnalysis not cached at a higher level");
if (!AC)		if (!AC)
report_fatal_error(		report_fatal_error(
"LoopUnrollPass: AssumptionAnalysis not cached at a higher level");		"LoopUnrollPass: AssumptionAnalysis not cached at a higher level");
		if (!BFI)
		report_fatal_error(
		"LoopUnrollPass: BlockFrequencyAnalysis not cached at a higher level");
		if (!PSI)
		report_fatal_error(
		"LoopUnrollPass: ProfileSummaryAnalysis not cached at a higher level");
if (!ORE)		if (!ORE)
report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "		report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
"cached at a higher level");		"cached at a higher level");

bool Changed =		bool Changed = tryToUnrollLoop(&L, DT, LI, SE, TTI, *AC, BFI, PSI,
tryToUnrollLoop(&L, DT, LI, SE, TTI, AC, ORE, /*PreserveLCSSA*/ true,		ORE, /*PreserveLCSSA*/ true, ProvidedCount,
ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,		ProvidedThreshold, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound);		ProvidedRuntime, ProvidedUpperBound);

if (!Changed)		if (!Changed)
return PreservedAnalyses::all();		return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();		return getLoopPassPreservedAnalyses();
}		}

test/Other/pass-pipelines.ll

	Show All 40 Lines
	; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and			; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and
	; causing new loop pass managers.			; causing new loop pass managers.
	; CHECK-O2: Simplify the CFG			; CHECK-O2: Simplify the CFG
	; CHECK-O2-NOT: Manager			; CHECK-O2-NOT: Manager
	; CHECK-O2: Combine redundant instructions			; CHECK-O2: Combine redundant instructions
	; CHECK-O2-NOT: Manager			; CHECK-O2-NOT: Manager
	; CHECK-O2: Loop Pass Manager			; CHECK-O2: Loop Pass Manager
	; CHECK-O2-NOT: Manager			; CHECK-O2-NOT: Manager
				; CHECK-O2: Loop Pass Manager
				; CHECK-O2-NEXT: Unroll loops
				mzolotukhinUnsubmitted Not Done Reply Inline Actions Hmm, is loop-unroll in a separate instance of loop pass manager now? mzolotukhin: Hmm, is loop-unroll in a separate instance of loop pass manager now?
				danielcdhAuthorUnsubmitted Not Done Reply Inline Actions Removed dependency to BFI/PSI danielcdh: Removed dependency to BFI/PSI
	; FIXME: It isn't clear that we need yet another loop pass pipeline			; FIXME: It isn't clear that we need yet another loop pass pipeline
	; and run of LICM here.			; and run of LICM here.
	; CHECK-O2-NOT: Manager			; CHECK-O2-NOT: Manager
	; CHECK-O2: Loop Pass Manager			; CHECK-O2: Loop Pass Manager
	; CHECK-O2-NEXT: Loop Invariant Code Motion			; CHECK-O2-NEXT: Loop Invariant Code Motion
	; CHECK-O2-NOT: Manager			; CHECK-O2-NOT: Manager
	; Next we break out of the main Function passes inside the CGSCC pipeline with			; Next we break out of the main Function passes inside the CGSCC pipeline with
	; a barrier pass.			; a barrier pass.
	▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

test/Transforms/LoopUnroll/unloop.ll

	; RUN: opt < %s -S -loop-unroll -verify-loop-info \| FileCheck %s			; RUN: opt < %s -S -loop-unroll -verify-loop-info \| FileCheck %s
	; RUN: opt < %s -S -passes='function(require<scalar-evolution>,require<targetir>,require<opt-remark-emit>,loop(unroll),verify<loops>)' \| FileCheck %s			; RUN: opt < %s -S -passes='module(require<profile-summary>,function(require<scalar-evolution>,require<targetir>,require<opt-remark-emit>,require<block-freq>,loop(unroll),verify<loops>))' \| FileCheck %s
	;			;
	; Unit tests for LoopInfo::markAsRemoved.			; Unit tests for LoopInfo::markAsRemoved.

	declare i1 @check() nounwind			declare i1 @check() nounwind

	; Ensure that tail->inner is removed and rely on verify-loopinfo to			; Ensure that tail->inner is removed and rely on verify-loopinfo to
	; check soundness.			; check soundness.
	;			;
	▲ Show 20 Lines • Show All 463 Lines • Show Last 20 Lines

test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll

This file was added.

				; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=10 -unroll-dynamic-cost-savings-discount=0 \| FileCheck %s

				@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16

				; CHECK-LABEL: foo
				; CHECK: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				mzolotukhinUnsubmitted Done Reply Inline Actions Please add `@` to the name. mzolotukhin: Please add `@` to the name.
				define i32 @foo(i32* noalias nocapture readonly %src) {
				mzolotukhinUnsubmitted Done Reply Inline Actions Is it enough to just check presence of the prologue? Maybe explicitly check that we have several copies of some instruction? mzolotukhin: Is it enough to just check presence of the prologue? Maybe explicitly check that we have…
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
				%r = phi i32 [ 0, %entry ], [ %add, %loop ]
				%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
				%src_element = load i32, i32* %arrayidx, align 4
				%array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				%const_array_element = load i32, i32* %array_const_idx, align 4
				%mul = mul nsw i32 %src_element, %const_array_element
				%add = add nsw i32 %mul, %r
				%inc = add nuw nsw i64 %iv, 1
				%exitcond86.i = icmp eq i64 %inc, 9
				br i1 %exitcond86.i, label %loop.end, label %loop

				loop.end:
				%r.lcssa = phi i32 [ %r, %loop ]
				ret i32 %r.lcssa
				}

				; CHECK-LABEL: foo_prof
				; CHECK-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				define i32 @foo_prof(i32* noalias nocapture readonly %src) !prof !15 {
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
				%r = phi i32 [ 0, %entry ], [ %add, %loop ]
				%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
				%src_element = load i32, i32* %arrayidx, align 4
				%array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				%const_array_element = load i32, i32* %array_const_idx, align 4
				%mul = mul nsw i32 %src_element, %const_array_element
				%add = add nsw i32 %mul, %r
				%inc = add nuw nsw i64 %iv, 1
				%exitcond86.i = icmp eq i64 %inc, 9
				br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16

				loop.end:
				%r.lcssa = phi i32 [ %r, %loop ]
				ret i32 %r.lcssa
				}

				; CHECK-LABEL: bar
				; CHECK-NOT: loop.prol
				define i32 @bar(i32* noalias nocapture readonly %src, i64 %c) {
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
				%r = phi i32 [ 0, %entry ], [ %add, %loop ]
				%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
				%src_element = load i32, i32* %arrayidx, align 4
				%array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				%const_array_element = load i32, i32* %array_const_idx, align 4
				%mul = mul nsw i32 %src_element, %const_array_element
				%add = add nsw i32 %mul, %r
				%inc = add nuw nsw i64 %iv, 1
				%exitcond86.i = icmp eq i64 %inc, %c
				br i1 %exitcond86.i, label %loop.end, label %loop

				loop.end:
				%r.lcssa = phi i32 [ %r, %loop ]
				ret i32 %r.lcssa
				}

				; CHECK-LABEL: bar_prof
				; CHECK: loop.prol
				define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !15 {
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
				%r = phi i32 [ 0, %entry ], [ %add, %loop ]
				%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
				%src_element = load i32, i32* %arrayidx, align 4
				%array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				%const_array_element = load i32, i32* %array_const_idx, align 4
				%mul = mul nsw i32 %src_element, %const_array_element
				%add = add nsw i32 %mul, %r
				%inc = add nuw nsw i64 %iv, 1
				%exitcond86.i = icmp eq i64 %inc, %c
				br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16

				loop.end:
				%r.lcssa = phi i32 [ %r, %loop ]
				ret i32 %r.lcssa
				}

				; CHECK-LABEL: bar_prof_flat
				; CHECK-NOT: loop.prol
				define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !15 {
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
				%r = phi i32 [ 0, %entry ], [ %add, %loop ]
				%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
				%src_element = load i32, i32* %arrayidx, align 4
				%array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
				%const_array_element = load i32, i32* %array_const_idx, align 4
				%mul = mul nsw i32 %src_element, %const_array_element
				%add = add nsw i32 %mul, %r
				%inc = add nuw nsw i64 %iv, 1
				%exitcond86.i = icmp eq i64 %inc, %c
				br i1 %exitcond86.i, label %loop, label %loop.end, !prof !16

				loop.end:
				%r.lcssa = phi i32 [ %r, %loop ]
				ret i32 %r.lcssa
				}

				!llvm.module.flags = !{!1}

				!1 = !{i32 1, !"ProfileSummary", !2}
				!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
				!3 = !{!"ProfileFormat", !"InstrProf"}
				!4 = !{!"TotalCount", i64 10000}
				!5 = !{!"MaxCount", i64 1000}
				!6 = !{!"MaxInternalCount", i64 1}
				!7 = !{!"MaxFunctionCount", i64 1000}
				!8 = !{!"NumCounts", i64 3}
				!9 = !{!"NumFunctions", i64 3}
				!10 = !{!"DetailedSummary", !11}
				!11 = !{!12, !13, !14}
				!12 = !{i32 10000, i64 100, i32 1}
				!13 = !{i32 999000, i64 100, i32 1}
				!14 = !{i32 999999, i64 1, i32 2}
				!15 = !{!"function_entry_count", i64 1}
				!16 = !{!"branch_weights", i32 1, i32 1000}