Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp =================================================================== --- lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp +++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp @@ -15,8 +15,14 @@ #include "PPC.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -24,6 +30,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "ppc-lower-mem-intrinsics" @@ -34,7 +41,28 @@ // enabled with 'ppc-expand-extra-memcpy=true'. A follow on patch will refine // the expansions based on profiling data. 
-STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop."); +STATISTIC(MemCpyCalls, "Total number of memcpy calls found."); +STATISTIC(MemCpyLoopNotExpanded, "Total number of memcpy calls not expanded."); +STATISTIC(MemCpyLoopExpansions, + "Total number of memcpy calls expanded into a loop."); +STATISTIC(MemCpyKnownSizeCalls, + "Total Number of known size memcpy calls found."); +STATISTIC(MemCpyUnknownSizeCalls, + "Total Number of unknown size memcpy calls found."); +STATISTIC(MemCpyVersioned, "Number of unknown size memcpy calls versioned."); +STATISTIC(MemCpyKnownSizeExpanded, + "Number of known size memcpy calls expanded into a loop."); +STATISTIC(MemCpyLTMemcpyLoopFloor, "Number of memcpy calls not expanded into a " + "loop because size lt MemcpyLoopFloor."); +STATISTIC(MemCpyGTMemcpyLoopCeil, "Number of memcpy calls not expanded into a " + "loop because size gt MemcpyLoopCeil."); +STATISTIC( + MemCpyPgoCold, + "Number of memcpy calls not expanded into a loop due to pgo cold path."); +STATISTIC(MemCpyMinSize, "Number of memcpy calls not expanded into a loop " + "cause it's compiling for min size or opt-none."); +STATISTIC(MemCpyNoTargetCPU, "Number of memcpy calls not expanded into a loop " + "because target cpu is not as expected."); using namespace llvm; @@ -62,8 +90,10 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); } + bool shouldExpandMemCpy(MemCpyInst *MC); bool runOnModule(Module &M) override; /// Loops over all uses of llvm.memcpy and expands the call if warranted. // \p MemcpyDecl is the function declaration of llvm.memcpy.
@@ -77,8 +107,11 @@ char PPCLowerMemIntrinsics::ID = 0; -INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", - "Lower mem intrinsics into loops", false, false) +INITIALIZE_PASS_BEGIN(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", + "Lower mem intrinsics into loops", false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_END(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", + "Lower mem intrinsics into loops", false, false) // Checks whether the cpu arch is one where we want to expand // memcpy calls. We expand for little-endian PPC cpus. @@ -91,30 +124,75 @@ } // Determines if we want to expand a specific memcpy call. -static bool shouldExpandMemCpy(MemCpyInst *MC) { +bool PPCLowerMemIntrinsics::shouldExpandMemCpy(MemCpyInst *MC) { // If compiling for -O0, -Oz or -Os we don't want to expand. Function *ParentFunc = MC->getParent()->getParent(); if (ParentFunc->optForSize() || - ParentFunc->hasFnAttribute(Attribute::OptimizeNone)) + ParentFunc->hasFnAttribute(Attribute::OptimizeNone)) { + ++MemCpyMinSize; return false; + } // See if the cpu arch is one we want to expand for. If there is no // target-cpu attibute assume we don't want to expand. Attribute CPUAttr = ParentFunc->getFnAttribute("target-cpu"); if (CPUAttr.hasAttribute(Attribute::None) || !CPUCheck(CPUAttr.getValueAsString())) { + ++MemCpyNoTargetCPU; return false; } - // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil]. 
+ // Check if it is a memcpy call with known size ConstantInt *CISize = dyn_cast<ConstantInt>(MC->getLength()); + if (CISize) + ++MemCpyKnownSizeCalls; + else + ++MemCpyUnknownSizeCalls; + + // Do not expand cold call sites based on profiling information + ProfileSummaryInfo *PSI = + getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + bool hasPGOInfo = false; + if (PSI) { + DominatorTree DT(*ParentFunc); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ParentFunc, LI); + BlockFrequencyInfo BFI(*ParentFunc, BPI, LI); + + Optional<uint64_t> Count = PSI->getProfileCount(MC, &BFI); + if (Count.hasValue()) { + hasPGOInfo = true; + if (PSI->isColdCallSite(CallSite(MC), &BFI)) { + ++MemCpyPgoCold; + return false; + } + } + } + + // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil]. if (CISize) { - return CISize->getZExtValue() >= MemcpyLoopFloor && - CISize->getZExtValue() <= MemcpyLoopCeil; + if (CISize->getZExtValue() > MemcpyLoopCeil) { + ++MemCpyGTMemcpyLoopCeil; + return false; + } else if (CISize->getZExtValue() < MemcpyLoopFloor) { + ++MemCpyLTMemcpyLoopFloor; + return false; + } + return true; } - // Otherwise expand unkown sizes ...
- return true; + // For unknown size, only version if there is PGO info + return (hasPGOInfo); +} + +// returns condition to be used to determine unknown size memCpy expansion +static Value *getExpandUnknownSizeMemCpyCond(MemCpyInst *MI) { + + IRBuilder<> Builder(MI); + Value *Op1 = MI->getLength(); + Value *Op2 = ConstantInt::get(Op1->getType(), MemcpyLoopCeil); + Value *Cond = Builder.CreateICmpULE(Op1, Op2); + return Cond; } // Wrapper function that determines which expansion to call depending on if the @@ -126,11 +204,23 @@ createMemCpyLoopKnownSize(MI, MI->getRawSource(), MI->getRawDest(), ConstLen, MI->getAlignment(), MI->getAlignment(), MI->isVolatile(), MI->isVolatile(), TTI); + ++MemCpyKnownSizeExpanded; } else { - createMemCpyLoopUnknownSize(MI, MI->getRawSource(), MI->getRawDest(), + // create if-then-else block and insert before memCpy instruction + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(getExpandUnknownSizeMemCpyCond(MI), + MI, &ThenTerm, &ElseTerm, nullptr); + // Generate memCpy expansion loop in then-block + createMemCpyLoopUnknownSize(ThenTerm, MI->getRawSource(), MI->getRawDest(), MI->getLength(), MI->getAlignment(), MI->getAlignment(), MI->isVolatile(), MI->isVolatile(), TTI); + + // create a copy of MI and insert it into the else-block + IRBuilder<> Builder(MI); + Builder.SetInsertPoint(ElseTerm); + Builder.Insert(MI->clone()); + ++MemCpyVersioned; } } @@ -141,13 +231,16 @@ for (auto I : F.users()) { MemCpyInst *MC = dyn_cast<MemCpyInst>(I); assert(MC && "Must be a MemcpyInst!"); + ++MemCpyCalls; if (shouldExpandMemCpy(MC)) { const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); ppcExpandMemCpyAsLoop(MC, TTI); MC->eraseFromParent(); AnyExpanded = true; - MemCpyLoopExpansions += 1; + ++MemCpyLoopExpansions; + } else { + ++MemCpyLoopNotExpanded; } } return AnyExpanded; Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll =================================================================== ---
test/CodeGen/PowerPC/memcpy-loop-expansion.ll +++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll @@ -86,13 +86,17 @@ ; Check the expansion of a memcpy whose size argument is not a compile time ; constant. -define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) { +define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) !prof !29 { entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) ret i8* %dst ; OPT-LABEL: @memcpy_unkown_size ; OPT: entry: +; OPT-NEXT: [[SizeCmp:%[0-9]+]] = icmp ule i64 %len, 256 +; OPT-NEXT: br i1 %0, label %[[ExpLabel:[0-9]+]], label %[[NoExpLabel:[0-9]+]] + +; OPT: