Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
===================================================================
--- lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
+++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
@@ -15,8 +15,14 @@
 #include "PPC.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -24,6 +30,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 #define DEBUG_TYPE "ppc-memcpy-loop-lowering"
 
@@ -33,14 +40,35 @@
-// currently expanding all call sites. The pass is off by default and can be
-// enabled with 'ppc-enable-memcpy-loops=true'.
+// currently expanding all call sites. The pass is on by default and can be
+// disabled with 'ppc-enable-memcpy-loops=false'.
 
-STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop.");
+STATISTIC(MemCpyCalls, "Total number of memcpy calls found.");
+STATISTIC(MemCpyLoopNotExpanded, "Total number of memcpy calls not expanded.");
+STATISTIC(MemCpyLoopExpansions,
+          "Total number of memcpy calls expanded into a loop.");
+STATISTIC(MemCpyKnownSizeCalls,
+          "Total number of known size memcpy calls found.");
+STATISTIC(MemCpyUnknownSizeCalls,
+          "Total number of unknown size memcpy calls found.");
+STATISTIC(MemCpyVersioned, "Number of unknown size memcpy calls versioned.");
+STATISTIC(MemCpyKnownSizeExpanded,
+          "Number of known size memcpy calls expanded into a loop.");
+STATISTIC(MemCpyLTMemcpyLoopFloor,
+          "Number of memcpy calls not expanded into a loop because the size "
+          "is less than MemcpyLoopFloor.");
+STATISTIC(MemCpyGTMemcpyLoopCeil,
+          "Number of memcpy calls not expanded into a loop because the size "
+          "is greater than MemcpyLoopCeil.");
+STATISTIC(
+    MemCpyPgoCold,
+    "Number of memcpy calls not expanded into a loop due to a PGO-cold path.");
+STATISTIC(MemCpyMinSize, "Number of memcpy calls not expanded into a loop "
+                         "because the function is compiled for min size or "
+                         "opt-none.");
+STATISTIC(MemCpyNoTargetCPU, "Number of memcpy calls not expanded into a loop "
+                             "because the target CPU is not one we expand for.");
 
 using namespace llvm;
 
 static cl::opt<bool> EnableMemcpyExpansionPass(
     "ppc-enable-memcpy-loops",
     cl::desc("Enable the PPC pass that lowers memcpy calls into loops."),
-    cl::init(false), cl::Hidden);
+    cl::init(true), cl::Hidden);
 
 // Options used to tune the size range where memcpy expansions occur.
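+// Known size copies are only expanded when the size falls in the range
+// [MemcpyLoopFloor, MemcpyLoopCeil]; unknown size copies are versioned at run
+// time against MemcpyLoopUnknownThreshold (see shouldExpandMemCpy below).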
 static cl::opt<unsigned> MemcpyLoopFloor(
@@ -49,9 +77,28 @@
         "The lower size bound of memcpy calls to get expanded into a loop"));
 
 static cl::opt<unsigned> MemcpyLoopCeil(
-    "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(256),
+    "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(512),
     cl::desc("The upper size bound of memcpy calls to get expanded in a loop"));
 
+static cl::opt<unsigned> MemcpyLoopUnknownThreshold(
+    "ppc-memcpy-loop-unknown-threshold", cl::Hidden, cl::init(128),
+    cl::desc("The upper size bound of memcpy calls to get expanded in a loop "
+             "for unknown sizes"));
+
+static cl::opt<bool> MemcpyLoopDoKnown(
+    "ppc-memcpy-known-loops",
+    cl::desc("Enable memcpy loop expansion for known size loops."),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> MemcpyLoopDoUnknown(
+    "ppc-memcpy-unknown-loops",
+    cl::desc("Enable memcpy loop expansion for unknown size loops."),
+    cl::init(false), cl::Hidden);
+
+static cl::opt<bool> MemcpyLoopDoUnknownNonPGO(
+    "ppc-memcpy-non-pgo-unknown-loops",
+    cl::desc("Enable memcpy loop expansion for unknown size loops even "
+             "without PGO information."),
+    cl::init(false), cl::Hidden);
+
 namespace {
 class PPCLowerMemIntrinsics : public ModulePass {
 public:
@@ -61,8 +108,10 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
   }
 
+  bool shouldExpandMemCpy(MemCpyInst *MC);
   bool runOnModule(Module &M) override;
   /// Loops over all uses of llvm.memcpy and expands the call if warranted.
   //  \p MemcpyDecl is the function declaration of llvm.memcpy.
@@ -74,7 +123,6 @@
 };
 } // end anonymous namespace
 
-
 // Checks whether the cpu arch is one where we want to expand
 // memcpy calls.
 static bool CPUCheck(const std::string &CpuStr) {
@@ -86,30 +134,85 @@
 }
 
 // Determines if we want to expand a specific memcpy call.
-static bool shouldExpandMemCpy(MemCpyInst *MC) {
+bool PPCLowerMemIntrinsics::shouldExpandMemCpy(MemCpyInst *MC) {
   // If compiling for -O0, -Oz or -Os we don't want to expand.
   Function *ParentFunc = MC->getParent()->getParent();
   if (ParentFunc->optForSize() ||
-      ParentFunc->hasFnAttribute(Attribute::OptimizeNone))
+      ParentFunc->hasFnAttribute(Attribute::OptimizeNone)) {
+    ++MemCpyMinSize;
     return false;
+  }
 
   // See if the cpu arch is one we want to expand for. If there is no
   // target-cpu attibute assume we don't want to expand.
   Attribute CPUAttr = ParentFunc->getFnAttribute("target-cpu");
   if (CPUAttr.hasAttribute(Attribute::None) ||
       !CPUCheck(CPUAttr.getValueAsString())) {
+    ++MemCpyNoTargetCPU;
     return false;
   }
 
-  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
+  // Check whether this is a memcpy call with a known size.
   ConstantInt *CISize = dyn_cast<ConstantInt>(MC->getLength());
   if (CISize) {
-    return CISize->getZExtValue() >= MemcpyLoopFloor &&
-           CISize->getZExtValue() <= MemcpyLoopCeil;
+    if (!MemcpyLoopDoKnown) {
+      ++MemCpyMinSize;
+      return false;
+    }
+    ++MemCpyKnownSizeCalls;
+  } else {
+    if (!MemcpyLoopDoUnknown) {
+      ++MemCpyMinSize;
+      return false;
+    }
+    ++MemCpyUnknownSizeCalls;
+  }
+
+  // Do not expand cold call sites based on profiling information.
+  ProfileSummaryInfo *PSI =
+      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  bool hasPGOInfo = false;
+  if (PSI) {
+    DominatorTree DT(*ParentFunc);
+    LoopInfo LI(DT);
+    BranchProbabilityInfo BPI(*ParentFunc, LI);
+    BlockFrequencyInfo BFI(*ParentFunc, BPI, LI);
+
+    Optional<uint64_t> Count = PSI->getProfileCount(MC, &BFI);
+    if (Count.hasValue()) {
+      hasPGOInfo = true;
+      if (PSI->isColdCallSite(CallSite(MC), &BFI)) {
+        ++MemCpyPgoCold;
+        return false;
+      }
+    }
+  }
+
+  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
+  if (CISize) {
+    if (CISize->getZExtValue() > MemcpyLoopCeil) {
+      ++MemCpyGTMemcpyLoopCeil;
+      return false;
+    } else if (CISize->getZExtValue() < MemcpyLoopFloor) {
+      ++MemCpyLTMemcpyLoopFloor;
+      return false;
+    }
+    return true;
   }
 
-  // Otherwise expand unkown sizes ...
-  return true;
+  // For unknown sizes, only version the call if there is PGO information.
+  return (hasPGOInfo || MemcpyLoopDoUnknownNonPGO);
+}
+
+// Returns the condition used to decide whether an unknown size memcpy should
+// be expanded into a loop.
+static Value *getExpandUnknownSizeMemCpyCond(MemCpyInst *MI) {
+  IRBuilder<> Builder(MI);
+  Value *Op1 = MI->getLength();
+  Value *Op2 = ConstantInt::get(Op1->getType(), MemcpyLoopUnknownThreshold);
+  Value *Cond = Builder.CreateICmpULE(Op1, Op2);
+  return Cond;
 }
 
 // Wrapper function that determines which expansion to call depending on if the
@@ -121,11 +224,23 @@
     createMemCpyLoopKnownSize(MI, MI->getRawSource(), MI->getRawDest(),
                               ConstLen, MI->getAlignment(), MI->getAlignment(),
                               MI->isVolatile(), MI->isVolatile(), TTI);
+    ++MemCpyKnownSizeExpanded;
   } else {
-    createMemCpyLoopUnknownSize(MI, MI->getRawSource(), MI->getRawDest(),
+    // Create an if-then-else diamond and insert it before the memcpy call.
+    TerminatorInst *ThenTerm, *ElseTerm;
+    SplitBlockAndInsertIfThenElse(getExpandUnknownSizeMemCpyCond(MI), MI,
+                                  &ThenTerm, &ElseTerm, nullptr);
+    // Generate the memcpy expansion loop in the then-block.
+    createMemCpyLoopUnknownSize(ThenTerm, MI->getRawSource(), MI->getRawDest(),
                                 MI->getLength(), MI->getAlignment(),
                                 MI->getAlignment(), MI->isVolatile(),
                                 MI->isVolatile(), TTI);
+
+    // Create a copy of MI and insert it into the else-block.
+    IRBuilder<> Builder(MI);
+    Builder.SetInsertPoint(ElseTerm);
+    Builder.Insert(MI->clone());
+    ++MemCpyVersioned;
   }
 }
 
@@ -137,6 +252,7 @@
   for (auto I : F.users()) {
     MemCpyInst *MC = dyn_cast<MemCpyInst>(I);
     assert(MC && "Must be a MemcpyInst!");
+    ++MemCpyCalls;
    if (shouldExpandMemCpy(MC)) {
      Function *ParentFunc = MC->getParent()->getParent();
      const TargetTransformInfo &TTI =
@@ -144,7 +260,9 @@
      ppcExpandMemCpyAsLoop(MC, TTI);
      MC->eraseFromParent();
      AnyExpanded = true;
-      MemCpyLoopExpansions += 1;
+      ++MemCpyLoopExpansions;
+    } else {
+      ++MemCpyLoopNotExpanded;
    }
  }
  return AnyExpanded;
@@ -177,5 +295,8 @@
 }
 
 char PPCLowerMemIntrinsics::ID = 0;
-INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
-                "Lower mem intrinsics into loops", false, false)
+INITIALIZE_PASS_BEGIN(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
+                      "Lower mem intrinsics into loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
+                    "Lower mem intrinsics into loops", false, false)
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -483,7 +483,7 @@
 Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context,
                                             Value *Length, unsigned SrcAlign,
                                             unsigned DestAlign) const {
-  return Type::getInt64Ty(Context);
+  return VectorType::get(Type::getInt64Ty(Context), 8);
 }
 
 /// Decomposes a copy operation with size \p RemainingBytes into the individual
@@ -492,13 +492,16 @@
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
   // Types to use in copy operations.
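+  // Types are tried widest first, so 16-byte pieces of the residual are
+  // copied with <2 x i64> operations before falling back to the scalar
+  // integer types.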
-  IntegerType *CopyTypes[] = {
+  Type *CopyTypes[] = {
+      VectorType::get(Type::getInt64Ty(Context), 2),
       Type::getInt64Ty(Context), Type::getInt32Ty(Context),
       Type::getInt16Ty(Context), Type::getInt8Ty(Context)};
 
   // Deconstructs the remaining bytes into individual operands.
   for (auto OpTy : CopyTypes) {
-    unsigned OpSize = OpTy->getBitWidth() / 8;
+    unsigned OpSize = OpTy->getScalarSizeInBits() / 8;
+    if (OpTy->isVectorTy())
+      OpSize *= OpTy->getVectorNumElements();
     // Loops just in case the remaining bytes are greater or equal to
     // twice the largest copy operand type.
     while (RemainingBytes >= OpSize) {
Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -98,7 +98,7 @@
     SrcAlign = std::min(SrcAlign, LoopOpSize);
     DestAlign = std::min(DestAlign, LoopOpSize);
 
-    SmallVector<Type *, 5> RemainingOps;
+    SmallVector<Type *, 8> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                           SrcAlign, DestAlign);
Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll
===================================================================
--- test/CodeGen/PowerPC/memcpy-loop-expansion.ll
+++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll
@@ -1,10 +1,17 @@
 ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \
 ; RUN: -mcpu=pwr8 %s| FileCheck -check-prefix=OPT %s
 ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \
 ; RUN: FileCheck %s --check-prefix PWR7
+; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
+; RUN: -ppc-memcpy-unknown-loops=true \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s | \
+; RUN: FileCheck %s --check-prefix OPTSMALL
 ; RUN: llc < %s -ppc-enable-memcpy-loops=true \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -O0 | \
 ; RUN: FileCheck %s --check-prefix OPTNONE
 
@@ -56,65 +63,89 @@
   ret i8* %dst
 ; OPT-LABEL: @memcpy_known_size
 ; OPT: entry:
-; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
-; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
+; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to <8 x i64>*
+; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to <8 x i64>*
 ; OPT-NEXT: br label %load-store-loop
 ; OPT: load-store-loop:
 ; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
-; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
-; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
-; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
-; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]]
+; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[SrcCast]], i64 %loop-index
+; OPT-NEXT: [[Load:%[0-9]+]] = load <8 x i64>, <8 x i64>* [[SrcGep]]
+; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[DstCast]], i64 %loop-index
+; OPT-NEXT: store <8 x i64> [[Load]], <8 x i64>* [[DstGep]]
 ; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1
-; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12
+; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 1
 ; OPT-NEXT: br i1 [[CMP]], label %load-store-loop, label %memcpy-split
 ; OPT: memcpy-split:
-; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32*
-; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
-; OPT-NEXT: [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]]
-; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32*
-; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
-; OPT-NEXT: store i32 [[Load2]], i32* [[DstGep2]]
+; OPT-NEXT: [[SrcAs2Xi64:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>*
+; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi64]], i64 4
+; OPT-NEXT: [[Load2:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep2]]
+; OPT-NEXT: [[DstAs2Xi64:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>*
+; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2Xi64]], i64 4
+; OPT-NEXT: store <2 x i64> [[Load2]], <2 x i64>* [[DstGep2]]
+; OPT-NEXT: [[SrcAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>*
+; OPT-NEXT: [[SrcGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi642]], i64 5
+; OPT-NEXT: [[Load3:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep3]]
+; OPT-NEXT: [[DstAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>*
+; OPT-NEXT: [[DstGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2Xi642]], i64 5
+; OPT-NEXT: store <2 x i64> [[Load3]], <2 x i64>* [[DstGep3]]
+; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to i32*
+; OPT-NEXT: [[SrcGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
+; OPT-NEXT: [[Load4:%[0-9]+]] = load i32, i32* [[SrcGep4]]
+; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to i32*
+; OPT-NEXT: [[DstGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
+; OPT-NEXT: store i32 [[Load4]], i32* [[DstGep4]]
 ; OPT-NEXT: ret i8* %dst
 }
 
 ; Check the expansion of a memcpy whose size argument is not a compile time
 ; constant.
-define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) {
+define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) !prof !29 {
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
   ret i8* %dst
 ; OPT-LABEL: @memcpy_unkown_size
 ; OPT: entry:
-; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
-; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
-; OPT-NEXT: [[LoopCount:%[0-9]+]] = udiv i64 %len, 8
-; OPT-NEXT: [[ResBytes:%[0-9]+]] = urem i64 %len, 8
+; OPT-NEXT: [[SizeCmp:%[0-9]+]] = icmp ule i64 %len, 128
+; OPT-NEXT: br i1 [[SizeCmp]], label %[[ExpLabel:[0-9]+]], label %[[NoExpLabel:[0-9]+]]
+
+; OPT: