Diff 85180

lib/Transforms/Instrumentation/PGOInstrumentation.cpp

Show First 20 Lines • Show All 143 Lines • ▼ Show 20 Lines
// Command line option to enable/disable select instruction instrumentation.		// Command line option to enable/disable select instruction instrumentation.
static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),		static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),
cl::Hidden);		cl::Hidden);

// Command line option to enable/disable memop intrinsic calls..		// Command line option to enable/disable memop intrinsic calls..
static cl::opt<bool> PGOInstrMemOP("pgo-instr-memop", cl::init(true),		static cl::opt<bool> PGOInstrMemOP("pgo-instr-memop", cl::init(true),
cl::Hidden);		cl::Hidden);

		// Tuning knobs for the profile-guided memory intrinsic size optimization.

		// The minimum call count to optimize memory intrinsic calls. A call site
		// whose total profiled count is below this value is left untouched.
		static cl::opt<unsigned>
		    MemOPCountThreshold("memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
		                        cl::init(1000),
		                        cl::desc("The minimum count to optimize memory "
		                                 "intrinsic calls"));

		// The percent threshold to optimize memory intrinsic calls. The selected
		// sizes must together account for more than this percentage of the total
		// profiled count for the call site to be versioned.
		static cl::opt<unsigned>
		    MemOPPercentThreshold("memop-percent-threshold", cl::init(66), cl::Hidden,
		                          cl::ZeroOrMore,
		                          cl::desc("The percentage threshold for the memory "
		                                   "intrinsic calls optimization"));

		// Maximum number of versions for optimizing memory intrinsic call. A value
		// of 0 disables the limit.
		static cl::opt<unsigned>
		    MemOPMaxVersion("memop-max-version", cl::init(3), cl::Hidden,
		                    cl::ZeroOrMore,
		                    cl::desc("The max version for the optimized memory "
		                             "intrinsic calls"));

namespace {		namespace {

/// The select instruction visitor plays three roles specified		/// The select instruction visitor plays three roles specified
/// by the mode. In \c VM_counting mode, it simply counts the number of		/// by the mode. In \c VM_counting mode, it simply counts the number of
/// select instructions. In \c VM_instrument mode, it inserts code to count		/// select instructions. In \c VM_instrument mode, it inserts code to count
/// the number times TrueValue of select is taken. In \c VM_annotate mode,		/// the number times TrueValue of select is taken. In \c VM_annotate mode,
/// it reads the profile data and annotate the select instruction with metadata.		/// it reads the profile data and annotate the select instruction with metadata.
enum VisitMode { VM_counting, VM_instrument, VM_annotate, VM_optimize };		enum VisitMode { VM_counting, VM_instrument, VM_annotate, VM_optimize };
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> {
Function &F;		Function &F;
unsigned NMemIs = 0; // Number of memIntrinsics instrumented.		unsigned NMemIs = 0; // Number of memIntrinsics instrumented.
VisitMode Mode = VM_counting; // Visiting mode.		VisitMode Mode = VM_counting; // Visiting mode.
unsigned CurCtrId = 0; // Current counter index.		unsigned CurCtrId = 0; // Current counter index.
unsigned TotalNumCtrs = 0; // Total number of counters		unsigned TotalNumCtrs = 0; // Total number of counters
GlobalVariable *FuncNameVar = nullptr;		GlobalVariable *FuncNameVar = nullptr;
uint64_t FuncHash = 0;		uint64_t FuncHash = 0;
PGOUseFunc *UseFunc = nullptr;		PGOUseFunc *UseFunc = nullptr;
		std::vector<MemIntrinsic *> Candidates;

MemIntrinsicVisitor(Function &Func) : F(Func) {}		MemIntrinsicVisitor(Function &Func) : F(Func) {}

void countMemIntrinsics(Function &Func) {		void countMemIntrinsics(Function &Func) {
NMemIs = 0;		NMemIs = 0;
Mode = VM_counting;		Mode = VM_counting;
visit(Func);		visit(Func);
}		}
▲ Show 20 Lines • Show All 384 Lines • ▼ Show 20 Lines	Builder.CreateCall(
{llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),		{llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
Builder.getInt64(FuncInfo.FunctionHash),		Builder.getInt64(FuncInfo.FunctionHash),
Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),		Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),		Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),
Builder.getInt32(NumIndirectCallSites++)});		Builder.getInt32(NumIndirectCallSites++)});
}		}
NumOfPGOICall += NumIndirectCallSites;		NumOfPGOICall += NumIndirectCallSites;

// Now instrument memop instrinsic calls:		// Now instrument memop intrinsic calls:
FuncInfo.MIVisitor.instrumentMemIntrinsics(F, NumCounters,		FuncInfo.MIVisitor.instrumentMemIntrinsics(F, NumCounters,
FuncInfo.FuncNameVar,		FuncInfo.FuncNameVar,
FuncInfo.FunctionHash);		FuncInfo.FunctionHash);
}		}

// This class represents a CFG edge in profile use compilation.		// This class represents a CFG edge in profile use compilation.
struct PGOUseEdge : public PGOEdge {		struct PGOUseEdge : public PGOEdge {
bool CountValid;		bool CountValid;
▲ Show 20 Lines • Show All 471 Lines • ▼ Show 20 Lines	Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_range_profile),		Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_range_profile),
{llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),		{llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty),		Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty),
Builder.getInt32(llvm::InstrProfValueKind::IPVK_MemOPSize),		Builder.getInt32(llvm::InstrProfValueKind::IPVK_MemOPSize),
Builder.getInt32(CurCtrId)});		Builder.getInt32(CurCtrId)});
++CurCtrId;		++CurCtrId;
}		}

		/// Optimize the variable-length memory intrinsic calls of \p Func using the
		/// size profile held by \p UF.
		///
		/// The function is first walked in counting mode; if the number of memory
		/// intrinsic sites found does not match the number of IPVK_MemOPSize value
		/// sites in the profile record, a warning is diagnosed and nothing is
		/// changed. Otherwise the candidate calls are collected and each one is
		/// handed to optimizeOneMemIntrinsic().
		void MemIntrinsicVisitor::optimizeMemIntrinsics(Function &Func, PGOUseFunc *UF) {
		  countMemIntrinsics(Func);
		  auto NumMemOPCalls = getNumOfMemIntrinsics();
		  unsigned NumValueSites =
		      UF->getProfileRecord().getNumValueSites(IPVK_MemOPSize);
		  if (NumValueSites != NumMemOPCalls) {
		    std::string Msg =
		        std::string("Inconsistent number of memory intrinsic calls: ") +
		        Func.getName().str();
		    auto &Ctx = Func.getContext();
		    Ctx.diagnose(DiagnosticInfoPGOProfile(Func.getParent()->getName().data(),
		                                          Msg, DS_Warning));
		    return;
		  }

		  // Now optimize memory intrinsic calls.
		  Candidates.clear();
		  // optimizeOneMemIntrinsic() derives each profile site index from CurCtrId,
		  // so start from zero instead of relying on whatever a previous walk left.
		  CurCtrId = 0;
		  Mode = VM_optimize;
		  UseFunc = UF;
		  // Collect the candidates first and rewrite afterwards:
		  // optimizeOneMemIntrinsic() splits blocks and creates new ones, which must
		  // not happen while the visitor is still iterating over the function.
		  visit(Func);
		  for (MemIntrinsic *MI : Candidates)
		    optimizeOneMemIntrinsic(*MI);
		}

		/// Map a memory intrinsic to the name used in optimization remarks.
		/// Anything other than memcpy, memmove or memset yields "unknown".
		static const char *getMIName(const MemIntrinsic &MI) {
		  const auto ID = MI.getIntrinsicID();
		  if (ID == Intrinsic::memcpy)
		    return "memcpy";
		  if (ID == Intrinsic::memmove)
		    return "memmove";
		  if (ID == Intrinsic::memset)
		    return "memset";
		  return "unknown";
		}
		/// Version one variable-length memory intrinsic call on its profiled sizes.
		///
		/// Reads the IPVK_MemOPSize value profile for the next site index and, when
		/// the call is hot enough and the frequent sizes cover enough of the total
		/// count, replaces the call with a switch on the length whose cases invoke
		/// the intrinsic with a constant size. The original call is kept as the
		/// default case.
		///
		/// \param MI the memory intrinsic call to optimize; memmove is skipped.
		void MemIntrinsicVisitor::optimizeOneMemIntrinsic(MemIntrinsic &MI) {
		  const InstrProfRecord &Record = UseFunc->getProfileRecord();
		  // Take this site's index before any early return so that every candidate
		  // advances the counter exactly once.
		  unsigned Index = CurCtrId++;
		  uint32_t NV = Record.getNumValueDataForSite(IPVK_MemOPSize, Index);
		  if (!NV)
		    return;

		  if (MI.getIntrinsicID() == Intrinsic::memmove)
		    return;

		  uint64_t Sum = 0;
		  std::unique_ptr<InstrProfValueData[]> VD =
		      Record.getValueForSite(IPVK_MemOPSize, Index, &Sum);

		  if (Sum < MemOPCountThreshold)
		    return;

		  ArrayRef<InstrProfValueData> VDs(VD.get(), NV);

		  uint64_t SumForSmallSizes = 0;
		  uint64_t Threshold = Sum / 100;
		  SmallVector<uint64_t, 16> SizeIds;
		  SmallVector<uint64_t, 16> CaseCounts;
		  uint64_t MaxCount = 0;
		  unsigned Version = 0;
		  // Default case is in the front -- save the slot here.
		  CaseCounts.push_back(0);
		  for (const auto &Entry : VDs) {
		    int64_t V = Entry.Value;
		    uint64_t C = Entry.Count;
		    // Only count the sizes that account for more than 1% of the Sum.
		    if (V > 0 && C > Threshold) {
		      SumForSmallSizes += C;
		      SizeIds.push_back(V);
		      CaseCounts.push_back(C);
		      if (C > MaxCount)
		        MaxCount = C;
		      // A MemOPMaxVersion of 0 means "no limit".
		      // NOTE(review): the check runs after the push and uses '>', so up to
		      // MemOPMaxVersion + 1 sizes are collected. Behavior is kept as is;
		      // confirm whether '>=' was intended.
		      if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
		        break;
		    }
		  }
		  // Add count for the default case.
		  uint64_t DC = Sum - SumForSmallSizes;
		  CaseCounts[0] = DC;
		  if (DC > MaxCount)
		    MaxCount = DC;

		  DEBUG(dbgs() << "Read one memory intrinsic profile: Index=" << Index
		               << ": " << SumForSmallSizes << " vs " << Sum << "\n");
		  DEBUG(for (const auto &Entry : VDs) {
		    dbgs() << " (" << Entry.Value << "," << Entry.Count << ")\n";
		  });

		  // Divide first for large sums; for sums below 100 multiply first so the
		  // threshold does not collapse to zero.
		  uint64_t CountThreshold = Sum / 100 * MemOPPercentThreshold;
		  if (Sum < 100)
		    CountThreshold = Sum * MemOPPercentThreshold / 100;
		  if (SumForSmallSizes <= CountThreshold)
		    return;

		  DEBUG(dbgs() << "Optimize one memory intrinsic call\n");

		  // mem_op(..., size)
		  // ==>
		  // switch (size) {
		  //   case s1:
		  //     mem_op(..., s1);
		  //     goto merge_bb;
		  //   case s2:
		  //     mem_op(..., s2);
		  //     goto merge_bb;
		  //   ...
		  //   default:
		  //     mem_op(..., size);
		  //     goto merge_bb;
		  // }
		  // merge_bb:

		  BasicBlock *BB = MI.getParent();
		  DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
		  DEBUG(dbgs() << *BB << "\n");

		  // First split: MI becomes the first instruction of DefaultBB.
		  BasicBlock *DefaultBB = SplitBlock(BB, &MI);
		  BasicBlock::iterator It(MI);
		  ++It;
		  assert(It != DefaultBB->end() &&
		         "Expected an instruction after the memory intrinsic");
		  // Second split: everything after MI moves to MergeBB.
		  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
		  DefaultBB->setName("MemOP.Default");
		  MergeBB->setName("MemOP.Merge");

		  auto &Ctx = F.getContext();
		  IRBuilder<> IRB(BB);
		  // Drop the branch left by the first split; the switch replaces it.
		  BB->getTerminator()->eraseFromParent();
		  Value *SizeVar = MI.getLength();
		  // Case constants must have the same integer type as the switch condition
		  // and as the intrinsic's length operand, which is not necessarily i64.
		  auto *SizeTy = cast<IntegerType>(SizeVar->getType());
		  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());

		  DEBUG(dbgs() << "\n\n== Basic Block After==\n");

		  for (uint64_t SizeId : SizeIds) {
		    ConstantInt *CaseSizeId = ConstantInt::get(SizeTy, SizeId);
		    BasicBlock *CaseBB = BasicBlock::Create(
		        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &F, DefaultBB);
		    Instruction *NewInst = MI.clone();
		    // Fix the argument. The clone of a MemIntrinsic is always a
		    // MemIntrinsic, so use the asserting cast.
		    cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
		    CaseBB->getInstList().push_back(NewInst);
		    IRBuilder<> IRBCase(CaseBB);
		    IRBCase.CreateBr(MergeBB);
		    SI->addCase(CaseSizeId, CaseBB);
		    DEBUG(dbgs() << *CaseBB << "\n");
		  }
		  // CaseCounts[0] is the default destination, followed by one count per case.
		  setProfMetadata(F.getParent(), SI, CaseCounts, MaxCount);

		  DEBUG(dbgs() << *BB << "\n");
		  DEBUG(dbgs() << *DefaultBB << "\n");
		  DEBUG(dbgs() << *MergeBB << "\n");

		  emitOptimizationRemark(
		      F.getContext(), "memop-opt", F, MI.getDebugLoc(),
		      Twine("optimize ") + getMIName(MI) + " with count " +
		          Twine(SumForSmallSizes) + " out of " + Twine(Sum) + " for " +
		          Twine(Version) + " versions");
		}

void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {		void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {
if (!PGOInstrMemOP)		if (!PGOInstrMemOP)
return;		return;
Value *Length = MI.getLength();		Value *Length = MI.getLength();
// Not instrument constant length calls.		// Not instrument constant length calls.
if (dyn_cast<ConstantInt>(Length))		if (dyn_cast<ConstantInt>(Length))
return;		return;

NMemIs++;		NMemIs++;
switch (Mode) {		switch (Mode) {
case VM_counting:		case VM_counting:
return;		return;
case VM_instrument:		case VM_instrument:
instrumentOneMemIntrinsic(MI);		instrumentOneMemIntrinsic(MI);
return;		return;
		case VM_optimize:
		Candidates.push_back(&MI);
		return;
default:		default:
break;		break;
}		}
llvm_unreachable("Unknown visiting mode");		llvm_unreachable("Unknown visiting mode");
}		}

// Traverse all the indirect callsites and annotate the instructions.		// Traverse all the indirect callsites and annotate the instructions.
void PGOUseFunc::annotateIndirectCallSites() {		void PGOUseFunc::annotateIndirectCallSites() {
Show All 20 Lines	void PGOUseFunc::annotateIndirectCallSites() {
for (auto &I : IndirectCallSites) {		for (auto &I : IndirectCallSites) {
DEBUG(dbgs() << "Read one indirect call instrumentation: Index="		DEBUG(dbgs() << "Read one indirect call instrumentation: Index="
<< IndirectCallSiteIndex << " out of " << NumValueSites		<< IndirectCallSiteIndex << " out of " << NumValueSites
<< "\n");		<< "\n");
annotateValueSite(M, I, ProfileRecord, IPVK_IndirectCallTarget,		annotateValueSite(M, I, ProfileRecord, IPVK_IndirectCallTarget,
IndirectCallSiteIndex, MaxNumAnnotations);		IndirectCallSiteIndex, MaxNumAnnotations);
IndirectCallSiteIndex++;		IndirectCallSiteIndex++;
}		}

		// Now optimize memory intrinsic calls.
		FuncInfo.MIVisitor.optimizeMemIntrinsics(F, this);
}		}
} // end anonymous namespace		} // end anonymous namespace

// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime		// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime
// aware this is an ir_level profile so it can set the version flag.		// aware this is an ir_level profile so it can set the version flag.
static void createIRLevelProfileFlagVariable(Module &M) {		static void createIRLevelProfileFlagVariable(Module &M) {
Type *IntTy64 = Type::getInt64Ty(M.getContext());		Type *IntTy64 = Type::getInt64Ty(M.getContext());
uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION \| VARIANT_MASK_IR_PROF);		uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION \| VARIANT_MASK_IR_PROF);
▲ Show 20 Lines • Show All 179 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[PGO] Memory intrinsic calls optimization based on profiled size
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 85180

lib/Transforms/Instrumentation/PGOInstrumentation.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[PGO] Memory intrinsic calls optimization based on profiled size
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 85180

lib/Transforms/Instrumentation/PGOInstrumentation.cpp

[PGO] Memory intrinsic calls optimization based on profiled size
ClosedPublic