This is an archive of the discontinued LLVM Phabricator instance.

[CSSPGO][llvm-profgen] Trim and merge context beforehand to reduce memory usage
ClosedPublic

Authored by wlei on Aug 9 2021, 9:18 PM.

Details

Summary

Currently we use a centralized string map (StringMap<FunctionSamples> ProfileMap) to store the profile while populating the samples, which can become a memory bottleneck. In an extreme case I saw thousands of samples whose context stack depth is >= 100, and memory consumption can exceed 100GB.

Since the context here is used for inlining, we can assume we won't have that many inlinees kept inlined at the same root function, so this change caps the context stack depth and merges the samples whose capped contexts collide, reducing peak memory. This is done after recursion compression.

The default value is -1, meaning no depth limit; in the future we can tune it to a smaller value.

Diff Detail

Event Timeline

wlei created this revision.Aug 9 2021, 9:18 PM
wlei requested review of this revision.Aug 9 2021, 9:18 PM
Herald added a project: Restricted Project. · View Herald TranscriptAug 9 2021, 9:18 PM
wlei updated this revision to Diff 365624.Aug 10 2021, 4:09 PM

align with line-number-based context

wlei retitled this revision from [CSSPGO][llvm-profgen] Cut off probe stack from the bottom to reduce memory usage to [CSSPGO][llvm-profgen] Cap context stack to reduce memory usage.Aug 10 2021, 4:35 PM
wlei edited the summary of this revision. (Show Details)
wlei added reviewers: hoy, wenlei, wmi.
wlei added a subscriber: spupyrev.
wenlei added inline comments.Aug 10 2021, 5:05 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

I think we could unify the switch names, e.g. csprof-max-context-depth and csprof-max-cold-context-depth?

619

Since getExpandedContextStr only covers the line-based profile, for probes we rely on the trimming here in profile generation, which is later than where we do the trimming for the line-based profile. Do we see a peak memory drop if we trim the context in profile generation instead of during unwinding?

llvm/tools/llvm-profgen/ProfileGenerator.h
73

nit: bottom-up order in a stack is usually caller-to-callee order; "from bottom" can be confusing, as it suggests we trim callees, which is not the case.

Also suggest renaming capContextStack to trimContext.

hoy added inline comments.Aug 10 2021, 5:06 PM
llvm/tools/llvm-profgen/PerfReader.cpp
23

Nit: move this into ProfileGenerator.h to reduce the number of declarations?

llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

Thanks for working on this. We probably do not inline that many levels of functions, but it would be good to run some perf testing or to turn this off by default.

wlei updated this revision to Diff 365653.Aug 10 2021, 8:46 PM

addressing Wenlei and Hongtao's feedback

wlei added inline comments.Aug 10 2021, 8:51 PM
llvm/tools/llvm-profgen/PerfReader.cpp
23

fixed!

llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

Sounds good. I will collect statistics on the max inline depth in the SampleProfile inliner on some benchmarks and switch to that value; maybe 10 is good enough.

619

Here it is done for both: once during unwinding (see the one in PerfReader.cpp) and once here.

The answer is yes, it's better than unwinder-only. Here is some data:

depth 10, both: 17GB
depth 10, unwinder only: 26GB
depth 20, both: 42GB
depth 20, unwinder only: 49GB

llvm/tools/llvm-profgen/ProfileGenerator.h
73

Fixed!

wlei updated this revision to Diff 365780.Aug 11 2021, 9:40 AM

fix lint

wlei added inline comments.Aug 11 2021, 1:05 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

Here is the max inline depth (if a inlines b and b inlines c, the depth is 2) in SPEC2017 monoLTO pass2 (all inliners turned on):

508.namd_r  5
510.parest_r 21
511.povray_r  8
526.blender_r 15
600.perlbench_s 8
602.gcc_s 21 
605.mcf_s 5
620.omnetpp_s 18
623.xalancbmk_s 26
625.x264_s 7
631.deepsjeng_s 5
638.imagick_s 10
641.leela_s 16
644.nab_s 5
657.xz_s 7

And for the clang-10 pass1 binary (I don't have a pass2 binary), the max inline depth is 51!

There is really more inlining than I thought, so I agree with turning it off (-1) by default.

wlei updated this revision to Diff 365832.Aug 11 2021, 1:09 PM

default the value to -1

wenlei added inline comments.Aug 11 2021, 1:33 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

Sounds reasonable. If we run into such situations more often, we could also try another level of aggregation keyed by the leaf frame of the stack samples; then we can tell that some contexts are cold before unwinding, and dynamically trim those cold contexts during unwinding.

Can we make the description and variable name consistent with CSProfColdContextFrameDepth too?

wenlei accepted this revision.Aug 11 2021, 1:47 PM
wenlei added inline comments.
llvm/tools/llvm-profgen/ProfileGenerator.cpp
53

To be specific: CSProfMaxContextDepth, CSProfMaxColdContextDepth. "Keep the last K frames while merging [cold] profile ..." Otherwise the change looks good.

This revision is now accepted and ready to land.Aug 11 2021, 1:47 PM
spupyrev added inline comments.Aug 11 2021, 1:54 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
74

The default value of "no trimming" sounds strange to me. Do we know whether trimming may affect performance? If yes, we shouldn't trim. If no, we should have a reasonable bound here.

wlei updated this revision to Diff 365846.Aug 11 2021, 2:02 PM

rewrite the switch's variable and description

wlei added inline comments.Aug 11 2021, 3:03 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
74

Do we know if trimming may affect performance?

It depends on how the compiler consumes the context. If the profile context is [a @ b @ c @ d] but the compiler only inlines d into c and stops inlining at c, then the [a @ b @] prefix can be trimmed. Another consideration is the profile size tradeoff: before, we merged and trimmed cold profiles after all samples were generated; this patch moves the merging ahead because we ran into memory issues. It doesn't drop any samples.

If no, we should have a reasonable bound here.

Good question, but I don't know what a reasonable bound is yet. As the comments above show, the SPEC benchmarks' max inline depth varies from 5 to 26, and the clang-10 binary even has a max depth of 51. We didn't hit memory issues for SPEC, so SPEC can use -1.

So it's a tradeoff between many things. What I have in mind is to default it to -1 and tune it on demand. What do you think?

hoy accepted this revision.Aug 11 2021, 3:08 PM
hoy added inline comments.
llvm/tools/llvm-profgen/ProfileGenerator.cpp
74

Default -1 (meaning no trimming) sounds good to me. An otherwise decent default value would need quite some experimentation to justify.

wlei retitled this revision from [CSSPGO][llvm-profgen] Cap context stack to reduce memory usage to [CSSPGO][llvm-profgen] Trim and merge context beforehand to reduce memory usage.Aug 11 2021, 3:12 PM
wlei edited the summary of this revision. (Show Details)
spupyrev added inline comments.Aug 11 2021, 3:13 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
74

I dislike tunable flags that may or may not affect performance. I assume most (all?) end-users won't know about the flag and will use the default value, so whatever you set here will be used in the vast majority of cases.

Of course, if you do plan to run some perf experiments in the (near) future and tune the default value, then the current value of -1 is OK.

wlei added inline comments.Aug 11 2021, 3:32 PM
llvm/tools/llvm-profgen/ProfileGenerator.cpp
74

I dislike tunable flags that may or may not affect performance. I assume most (all?) end-users won't know about the flag and will use the default value, so whatever you set here will be used in the vast majority of cases.

That makes sense, thanks! Will tune it; maybe the clang binary is a good place, as it has deep call stacks.

This revision was landed with ongoing or failed builds.Aug 11 2021, 4:02 PM
This revision was automatically updated to reflect the committed changes.