Index: lib/Transforms/Instrumentation/PGOInstrumentation.cpp =================================================================== --- lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -149,6 +149,27 @@ static cl::opt PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden); +// The minimum call count to optimize memory intrinsic calls. +static cl::opt + MemOPCountThreshold("memop-count-threshold", cl::Hidden, cl::ZeroOrMore, + cl::init(1000), + cl::desc("The minimum count to optimize memory " + "intrinsic calls")); + +// The percent threshold to optimize memory intrinsic calls. +static cl::opt + MemOPPercentThreshold("memop-percent-threshold", cl::init(66), cl::Hidden, + cl::ZeroOrMore, + cl::desc("The percentage threshold for the optimize " + " memory intrinsic calls")); + +// Maximum number of versions for optimizing memory intrinsic call. +static cl::opt + MemOPMaxVersion("memop-max-version", cl::init(3), cl::Hidden, + cl::ZeroOrMore, + cl::desc("The max version for the optimize memory " + " intrinsic calls")); + namespace { /// The select instruction visitor plays three roles specified @@ -215,6 +236,7 @@ GlobalVariable *FuncNameVar = nullptr; uint64_t FuncHash = 0; PGOUseFunc *UseFunc = nullptr; + std::vector Candidates; MemIntrinsicVisitor(Function &Func) : F(Func) {} @@ -615,7 +637,7 @@ } NumOfPGOICall += NumIndirectCallSites; - // Now instrument memop instrinsic calls: + // Now instrument memop intrinsic calls: FuncInfo.MIVisitor.instrumentMemIntrinsics(F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); @@ -1103,6 +1125,165 @@ ++CurCtrId; } +void MemIntrinsicVisitor::optimizeMemIntrinsics(Function &Func, PGOUseFunc *UF) { + countMemIntrinsics(Func); + auto NumMemOPCalls = getNumOfMemIntrinsics(); + unsigned NumValueSites = UF->getProfileRecord().getNumValueSites(IPVK_MemOPSize); + if (NumValueSites != NumMemOPCalls) { + std::string Msg = + std::string("Inconsistent number of memory intrinsic calls: ") + + Func.getName().str(); + auto &Ctx = Func.getContext(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(Func.getParent()->getName().data(), Msg, DS_Warning)); + return; + } + + // Now optimize memory intrinsic calls. + Candidates.clear(); + Mode = VM_optimize; + UseFunc = UF; + visit(Func); + for (auto &I : Candidates) + optimizeOneMemIntrinsic(*I); +} + +static const char *getMIName(const MemIntrinsic &MI) { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + return "memcpy"; + case Intrinsic::memmove: + return "memmove"; + case Intrinsic::memset: + return "memset"; + default: + return "unknown"; + } +} +void MemIntrinsicVisitor::optimizeOneMemIntrinsic(MemIntrinsic &MI) { + const InstrProfRecord &InstrProfRecord = UseFunc->getProfileRecord(); + unsigned Index = CurCtrId++; + uint32_t NV = InstrProfRecord.getNumValueDataForSite(IPVK_MemOPSize, Index); + if (!NV) + return; + + if (MI.getIntrinsicID() == Intrinsic::memmove) + return; + + uint64_t Sum = 0; + std::unique_ptr VD = + InstrProfRecord.getValueForSite(IPVK_MemOPSize, Index, &Sum); + + if (Sum < MemOPCountThreshold) + return; + + ArrayRef VDs(VD.get(), NV); + + uint64_t SumForSmallSizes = 0; + uint64_t Threshold = Sum / 100; + SmallVector SizeIds; + SmallVector CaseCounts; + uint64_t MaxCount = 0; + unsigned Version = 0; + // Default case is in the front -- save the slot here. + CaseCounts.push_back(0); + for (auto &VD : VDs) { + int64_t V = VD.Value; + uint64_t C = VD.Count; + // only count these greater than 1% of the Sum; + if (V > 0 && C > Threshold) { + SumForSmallSizes += C; + SizeIds.push_back(V); + CaseCounts.push_back(C); + if (C > MaxCount) + MaxCount = C; + if (++Version > MemOPMaxVersion && MemOPMaxVersion !=0) + break; + } + } + // Add count for the default case. + uint64_t DC = Sum - SumForSmallSizes; + CaseCounts[0] = DC; + if (DC > MaxCount) + MaxCount = DC; + + DEBUG(dbgs() << "Read one memory intrinsic profile: Index=" << Index + << ": " << SumForSmallSizes << " vs " << Sum << "\n"); + DEBUG(for (auto &VD : VDs) { dbgs() << " (" << VD.Value << "," + << VD.Count << ")\n"; }); + + uint64_t CountThreshold = Sum / 100 * MemOPPercentThreshold; + if (Sum < 100) + CountThreshold = Sum * MemOPPercentThreshold /100; + if (SumForSmallSizes <= CountThreshold) + return; + + DEBUG(dbgs() << "Optimize one memory intrinsic call\n"); + + // mem_op(..., size) + // ==> + // switch (size) { + // case s1: + // mem_op(..., s1); + // goto merge_bb; + // case s2: + // mem_op(..., s2); + // goto merge_bb; + // ... + // default: + // mem_op(..., size); + // goto merge_bb; + // } + // merge_bb: + + BasicBlock *BB = MI.getParent(); + DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); + DEBUG(dbgs() << *BB << "\n"); + + BasicBlock *DefaultBB = SplitBlock(BB, &MI); + BasicBlock::iterator It(MI); + ++It; + assert(It != DefaultBB->end()); + BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); + DefaultBB->setName("MemOP.Default"); + MergeBB->setName("MemOP.Merge"); + + auto &Ctx = F.getContext(); + IRBuilder<> IRB(BB); + BB->getTerminator()->eraseFromParent(); + Value *SizeVar = MI.getLength(); + SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); + + DEBUG(dbgs() << "\n\n== Basic Block After==\n"); + + for (uint64_t SizeId: SizeIds) { + ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); + BasicBlock *CaseBB = BasicBlock::Create(Ctx, + Twine("MemOP.Case.") + Twine(SizeId), + &F, DefaultBB); + Instruction *NewInst = MI.clone(); + // Fix the argument. + dyn_cast(NewInst)->setLength(CaseSizeId); + CaseBB->getInstList().push_back(NewInst); + IRBuilder<> IRBCase(CaseBB); + IRBCase.CreateBr(MergeBB); + SI->addCase(CaseSizeId, CaseBB); + DEBUG(dbgs() << *CaseBB << "\n"); + } + setProfMetadata(F.getParent(), SI, CaseCounts, MaxCount); + + DEBUG(dbgs() << *BB << "\n"); + DEBUG(dbgs() << *DefaultBB << "\n"); + DEBUG(dbgs() << *MergeBB << "\n"); + + emitOptimizationRemark( + F.getContext(), "memop-opt", F, MI.getDebugLoc(), + Twine("optimize ") + getMIName(MI) + " with count " + + Twine(SumForSmallSizes) + " out of " + Twine(Sum) + " for " + + Twine(Version) + " versions"); + +} + void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { if (!PGOInstrMemOP) return; @@ -1118,6 +1299,9 @@ case VM_instrument: instrumentOneMemIntrinsic(MI); return; + case VM_optimize: + Candidates.push_back(&MI); + return; default: break; } @@ -1154,6 +1338,9 @@ IndirectCallSiteIndex, MaxNumAnnotations); IndirectCallSiteIndex++; } + + // Now optimize memory intrinsic calls. + FuncInfo.MIVisitor.optimizeMemIntrinsics(F, this); } } // end anonymous namespace