Diff 279589

llvm/lib/CodeGen/MachineBlockPlacement.cpp

Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines	static cl::opt<unsigned> TailDupPlacementPenalty(
"tail-dup-placement-penalty",		"tail-dup-placement-penalty",
cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "		cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
"Copying can increase fallthrough, but it also increases icache "		"Copying can increase fallthrough, but it also increases icache "
"pressure. This parameter controls the penalty to account for that. "		"pressure. This parameter controls the penalty to account for that. "
"Percent as integer."),		"Percent as integer."),
cl::init(2),		cl::init(2),
cl::Hidden);		cl::Hidden);

		// Heuristic for tail duplication if profile count is used in cost model.
		static cl::opt<unsigned> TailDupProfilePercentThreshold(
		davidxlUnsubmitted Done Reply Inline Actions Nit: TailDupProfilePercentThreshold davidxl: Nit: TailDupProfilePercentThreshold
		"tail-dup-profile-percent-threshold",
		cl::desc("If profile count information is used in tail duplication cost "
		"model, the gained fall through number from tail duplication "
		"should be at least this percent of hot count."),
		cl::init(50), cl::Hidden);

// Heuristic for triangle chains.		// Heuristic for triangle chains.
static cl::opt<unsigned> TriangleChainCount(		static cl::opt<unsigned> TriangleChainCount(
"triangle-chain-count",		"triangle-chain-count",
cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "		cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
"triangle tail duplication heuristic to kick in. 0 to disable."),		"triangle tail duplication heuristic to kick in. 0 to disable."),
cl::init(2),		cl::init(2),
cl::Hidden);		cl::Hidden);

▲ Show 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	class MachineBlockPlacement : public MachineFunctionPass {
/// Placement decisions can open up new tail duplication opportunities, but		/// Placement decisions can open up new tail duplication opportunities, but
/// since tail duplication affects placement decisions of later blocks, it		/// since tail duplication affects placement decisions of later blocks, it
/// must be done inline.		/// must be done inline.
TailDuplicator TailDup;		TailDuplicator TailDup;

/// Partial tail duplication threshold.		/// Partial tail duplication threshold.
BlockFrequency DupThreshold;		BlockFrequency DupThreshold;

		/// True: use block profile count to compute tail duplication cost.
		/// False: use block frequency to compute tail duplication cost.
		bool UseProfileCount;

/// Allocator and owner of BlockChain structures.		/// Allocator and owner of BlockChain structures.
///		///
/// We build BlockChains lazily while processing the loop structure of		/// We build BlockChains lazily while processing the loop structure of
/// a function. To reduce malloc traffic, we allocate them using this		/// a function. To reduce malloc traffic, we allocate them using this
/// slab-like allocator, and destroy them after the pass completes. An		/// slab-like allocator, and destroy them after the pass completes. An
/// important guarantee is that this allocator produces stable pointers to		/// important guarantee is that this allocator produces stable pointers to
/// the chains.		/// the chains.
SpecificBumpPtrAllocator<BlockChain> ChainAllocator;		SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
Show All 9 Lines
#ifndef NDEBUG		#ifndef NDEBUG
/// The set of basic blocks that have terminators that cannot be fully		/// The set of basic blocks that have terminators that cannot be fully
/// analyzed. These basic blocks cannot be re-ordered safely by		/// analyzed. These basic blocks cannot be re-ordered safely by
/// MachineBlockPlacement, and we must preserve physical layout of these		/// MachineBlockPlacement, and we must preserve physical layout of these
/// blocks and their successors through the pass.		/// blocks and their successors through the pass.
SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;		SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
#endif		#endif

		/// Get block profile count or frequency according to UseProfileCount.
		/// The return value is used to model tail duplication cost.
		BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
		davidxlUnsubmitted Not Done Reply Inline Actions This should probably return Optional<..> davidxl: This should probably return Optional<..>
		CarrotAuthorUnsubmitted Done Reply Inline Actions See following reply. Carrot: See following reply.
		if (UseProfileCount) {
		auto Count = MBFI->getMBFI().getBlockProfileCount(BB);
		if (Count)
		return *Count;
		else
		return 0;
		davidxlUnsubmitted Not Done Reply Inline Actions Return None here. Also when None is returned, I think the caller needs to handle it conservatively -- perhaps resort to Freq based method davidxl: Return None here. Also when None is returned, I think the caller needs to handle it…
		CarrotAuthorUnsubmitted Done Reply Inline Actions Look at the implementation of getBlockProfileCount and getBlockFreq. BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const { return BFI ? BFI->getBlockFreq(BB) : 0; } Optional<uint64_t> BlockFrequencyInfo::getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic) const { if (!BFI) return None; return BFI->getBlockProfileCount(*getFunction(), BB, AllowSynthetic); } In the same condition (!BFI), neither function returns a meaningful result, but one returns None and the other returns 0. Another situation where None may be returned is when the function has no profile count, so the result is actually a meaningful 0. Carrot: Look at the implementation of getBlockProfileCount and getBlockFreq. BlockFrequency…
		} else
		return MBFI->getBlockFreq(BB);
		}

/// Scale the DupThreshold according to basic block size.		/// Scale the DupThreshold according to basic block size.
BlockFrequency scaleThreshold(MachineBasicBlock *BB);		BlockFrequency scaleThreshold(MachineBasicBlock *BB);
void initDupThreshold();		void initDupThreshold();

/// Decrease the UnscheduledPredecessors count for all blocks in chain, and		/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
/// if the count goes to 0, add them to the appropriate work list.		/// if the count goes to 0, add them to the appropriate work list.
void markChainSuccessors(		void markChainSuccessors(
const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,		const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
▲ Show 20 Lines • Show All 2,702 Lines • ▼ Show 20 Lines	for (MachineBasicBlock *Succ : Pred->successors())
}		}

BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);		BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
if (BBProb <= BestProb)		if (BBProb <= BestProb)
return false;		return false;

// Compute the number of reduced taken branches if Pred falls through to BB		// Compute the number of reduced taken branches if Pred falls through to BB
// instead of another successor. Then compare it with threshold.		// instead of another successor. Then compare it with threshold.
BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);		BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
BlockFrequency Gain = PredFreq * (BBProb - BestProb);		BlockFrequency Gain = PredFreq * (BBProb - BestProb);
return Gain > scaleThreshold(BB);		return Gain > scaleThreshold(BB);
}		}

// Find out the predecessors of BB and BB can be beneficially duplicated into		// Find out the predecessors of BB and BB can be beneficially duplicated into
// them.		// them.
void MachineBlockPlacement::findDuplicateCandidates(		void MachineBlockPlacement::findDuplicateCandidates(
SmallVectorImpl<MachineBasicBlock *> &Candidates,		SmallVectorImpl<MachineBasicBlock *> &Candidates,
MachineBasicBlock *BB,		MachineBasicBlock *BB,
BlockFilterSet *BlockFilter) {		BlockFilterSet *BlockFilter) {
MachineBasicBlock *Fallthrough = nullptr;		MachineBasicBlock *Fallthrough = nullptr;
BranchProbability DefaultBranchProb = BranchProbability::getZero();		BranchProbability DefaultBranchProb = BranchProbability::getZero();
BlockFrequency BBDupThreshold(scaleThreshold(BB));		BlockFrequency BBDupThreshold(scaleThreshold(BB));
SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end());		SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end());
SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end());		SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end());

		davidxlUnsubmitted Not Done Reply Inline Actions Is this change related? davidxl: Is this change related?
		CarrotAuthorUnsubmitted Done Reply Inline Actions It is not profile count cost model related. It is a different tail dup improvement. Do you want me to send another patch for it? Carrot: It is not profile count cost model related. It is a different tail dup improvement. Do you want…
		davidxlUnsubmitted Not Done Reply Inline Actions Better separate it. Is this contributing to the performance improvement mentioned? davidxl: Better separate it. Is this contributing to the performance improvement mentioned?
		CarrotAuthorUnsubmitted Done Reply Inline Actions It doesn't cause visible performance impact. Carrot: It doesn't cause visible performance impact.
// Sort for highest frequency.		// Sort for highest frequency.
auto CmpSucc = [&](MachineBasicBlock A, MachineBasicBlock B) {		auto CmpSucc = [&](MachineBasicBlock A, MachineBasicBlock B) {
return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);		return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
};		};
auto CmpPred = [&](MachineBasicBlock A, MachineBasicBlock B) {		auto CmpPred = [&](MachineBasicBlock A, MachineBasicBlock B) {
return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);		return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
};		};
llvm::stable_sort(Succs, CmpSucc);		llvm::stable_sort(Succs, CmpSucc);
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	void MachineBlockPlacement::findDuplicateCandidates(
// duplicated into PB, and one successor is layout after it (SB1 for PB1 and		// duplicated into PB, and one successor is layout after it (SB1 for PB1 and
// SB2 for PB2 in our case). If there is no available successor, the combined		// SB2 for PB2 in our case). If there is no available successor, the combined
// block jumps to all BB's successor, like PB3 in this example.		// block jumps to all BB's successor, like PB3 in this example.
//		//
// If a predecessor has multiple successors, so BB can't be duplicated into		// If a predecessor has multiple successors, so BB can't be duplicated into
// it. But it can beneficially fall through to BB, and duplicate BB into other		// it. But it can beneficially fall through to BB, and duplicate BB into other
// predecessors.		// predecessors.
for (MachineBasicBlock *Pred : Preds) {		for (MachineBasicBlock *Pred : Preds) {
BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);		BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);

if (!TailDup.canTailDuplicate(BB, Pred)) {		if (!TailDup.canTailDuplicate(BB, Pred)) {
// BB can't be duplicated into Pred, but it is possible to be layout		// BB can't be duplicated into Pred, but it is possible to be layout
// below Pred.		// below Pred.
if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {		if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
Fallthrough = Pred;		Fallthrough = Pred;
if (SuccIt != Succs.end())		if (SuccIt != Succs.end())
SuccIt++;		SuccIt++;
Show All 32 Lines	void MachineBlockPlacement::findDuplicateCandidates(
}		}
}		}

void MachineBlockPlacement::initDupThreshold() {		void MachineBlockPlacement::initDupThreshold() {
DupThreshold = 0;		DupThreshold = 0;
if (!F->getFunction().hasProfileData())		if (!F->getFunction().hasProfileData())
return;		return;

		// We prefer to use prifile count.
		uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
		if (HotThreshold != UINT64_MAX) {
		UseProfileCount = true;
		DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
		return;
		}

		// Profile count is not available, we can use block frequency instead.
BlockFrequency MaxFreq = 0;		BlockFrequency MaxFreq = 0;
for (MachineBasicBlock &MBB : *F) {		for (MachineBasicBlock &MBB : *F) {
BlockFrequency Freq = MBFI->getBlockFreq(&MBB);		BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
if (Freq > MaxFreq)		if (Freq > MaxFreq)
MaxFreq = Freq;		MaxFreq = Freq;
}		}

// FIXME: we may use profile count instead of frequency,
// and need more fine tuning.
BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);		BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
DupThreshold = MaxFreq * ThresholdProb;		DupThreshold = MaxFreq * ThresholdProb;
		UseProfileCount = false;
}		}

bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {		bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))		if (skipFunction(MF.getFunction()))
return false;		return false;

// Check for single-block functions and skip them.		// Check for single-block functions and skip them.
if (std::next(MF.begin()) == MF.end())		if (std::next(MF.begin()) == MF.end())
▲ Show 20 Lines • Show All 185 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/dup-cost.ll

This file was added.

				; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

				; Cold function, %dup should not be duplicated into predecessors.
				define i32 @cold(i32 %a, i32* %p, i32* %q) !prof !21 {
				; CHECK-LABEL: cold
				; CHECK: %entry
				; CHECK: %true1
				; CHECK: %dup
				; CHECK: %true2
				; CHECK: %false1
				; CHECK: %false2
				entry:
				%cond1 = icmp sgt i32 %a, 1
				br i1 %cond1, label %true1, label %false1, !prof !30

				true1:
				%v1 = load i32, i32* %p, align 4
				%v2 = add i32 %v1, 2
				br label %dup

				false1:
				%v3 = load i32, i32* %q, align 4
				%v4 = sub i32 %v3, 3
				br label %dup

				dup:
				%v5 = phi i32 [%v2, %true1], [%v4, %false1]
				%cond2 = icmp sgt i32 %v5, 4
				br i1 %cond2, label %true2, label %false2, !prof !30

				true2:
				%v6 = xor i32 %v5, %a
				br label %exit

				false2:
				%v7 = and i32 %v5, %a
				br label %exit

				exit:
				%v8 = phi i32 [%v6, %true2], [%v7, %false2]
				ret i32 %v8
				}

				; Same code as previous function, but with hot profile count.
				; So %dup should be duplicated into predecessors.
				define i32 @hot(i32 %a, i32* %p, i32* %q) !prof !22 {
				; CHECK-LABEL: hot
				; CHECK: %entry
				; CHECK: %true1
				; CHECK: %false2
				; CHECK: %false1
				; CHECK: %true2
				entry:
				%cond1 = icmp sgt i32 %a, 1
				br i1 %cond1, label %true1, label %false1, !prof !30

				true1:
				%v1 = load i32, i32* %p, align 4
				%v2 = add i32 %v1, 2
				br label %dup

				false1:
				%v3 = load i32, i32* %q, align 4
				%v4 = sub i32 %v3, 3
				br label %dup

				dup:
				%v5 = phi i32 [%v2, %true1], [%v4, %false1]
				%cond2 = icmp sgt i32 %v5, 4
				br i1 %cond2, label %true2, label %false2, !prof !30

				true2:
				%v6 = xor i32 %v5, %a
				br label %exit

				false2:
				%v7 = and i32 %v5, %a
				br label %exit

				exit:
				%v8 = phi i32 [%v6, %true2], [%v7, %false2]
				ret i32 %v8
				}


				!llvm.module.flags = !{!1}
				!21 = !{!"function_entry_count", i64 10}
				!22 = !{!"function_entry_count", i64 400}

				!30 = !{!"branch_weights", i32 1, i32 1}

				!1 = !{i32 1, !"ProfileSummary", !2}
				!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
				!3 = !{!"ProfileFormat", !"InstrProf"}
				!4 = !{!"TotalCount", i64 10000}
				!5 = !{!"MaxCount", i64 10}
				!6 = !{!"MaxInternalCount", i64 1}
				!7 = !{!"MaxFunctionCount", i64 1000}
				!8 = !{!"NumCounts", i64 3}
				!9 = !{!"NumFunctions", i64 3}
				!10 = !{!"DetailedSummary", !11}
				!11 = !{!12, !13, !14}
				!12 = !{i32 10000, i64 100, i32 1}
				!13 = !{i32 999000, i64 100, i32 1}
				!14 = !{i32 999999, i64 1, i32 2}

This is an archive of the discontinued LLVM Phabricator instance.

[MBP] Use profile count to compute tail dup cost if it is available
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 279589

llvm/lib/CodeGen/MachineBlockPlacement.cpp

llvm/test/CodeGen/X86/dup-cost.ll

This is an archive of the discontinued LLVM Phabricator instance.

[MBP] Use profile count to compute tail dup cost if it is available
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 279589

llvm/lib/CodeGen/MachineBlockPlacement.cpp

llvm/test/CodeGen/X86/dup-cost.ll

[MBP] Use profile count to compute tail dup cost if it is available
ClosedPublic