Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp =================================================================== --- lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp +++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp @@ -15,7 +15,10 @@ #include "PPC.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -24,23 +27,26 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "ppc-memcpy-loop-lowering" // This pass will loop over all MemCpyInstrs and expand some of them into loops. // For known compile time sizes, calls where the size belongs to // [MemcpyLoopFloor, MemcpyLoopCeil] will be expanded. For unknown sizes we are -// currently expanding all call sites. The pass is off by default and can be -// enabled with 'ppc-enable-memcpy-loops=true'. +// currently not expanding memcpy calls by default. memcpy calls of unknown sizes +// in hot paths can be expanded by the flag 'ppc-memcpy-unknown-loops=true'. +// The pass is on by default and can be disabled with 'ppc-enable-memcpy-loops=false'. -STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop."); +STATISTIC(MemCpyLoopExpansions, + "Total number of memcpy calls expanded into a loop."); using namespace llvm; static cl::opt EnableMemcpyExpansionPass( "ppc-enable-memcpy-loops", cl::desc("Enable the PPC pass that lowers memcpy calls into loops."), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); // Options used to tune the size range where memcpy expansions occur. 
static cl::opt MemcpyLoopFloor( @@ -49,9 +55,28 @@ "The lower size bound of memcpy calls to get expanded into a loop")); static cl::opt MemcpyLoopCeil( - "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(256), + "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(512), cl::desc("The upper size bound of memcpy calls to get expanded in a loop")); +static cl::opt MemcpyLoopUnknownThreshold( + "ppc-memcpy-loop-unknown-threshold", cl::Hidden, cl::init(128), + cl::desc("The upper size bound of memcpy calls to get expanded in a loop for unknown sizes")); + +static cl::opt MemcpyLoopDoKnown( + "ppc-memcpy-known-loops", + cl::desc("Enable memcpy loop expansion for known size loops."), + cl::init(true), cl::Hidden); + +static cl::opt MemcpyLoopDoUnknown( + "ppc-memcpy-unknown-loops", + cl::desc("Enable memcpy loop expansion for unknown size loops."), + cl::init(false), cl::Hidden); + +static cl::opt MemcpyLoopDoUnknownNonPGO( + "ppc-memcpy-non-pgo-unknown-loops", + cl::desc("Enable memcpy loop expansion for unknown size loops even without PGO information."), + cl::init(false), cl::Hidden); + namespace { class PPCLowerMemIntrinsics : public ModulePass { public: @@ -61,8 +86,12 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } + bool shouldExpandMemCpy(MemCpyInst *MC, ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI); bool runOnModule(Module &M) override; /// Loops over all uses of llvm.memcpy and expands the call if warranted. // \p MemcpyDecl is the function declaration of llvm.memcpy. @@ -74,7 +103,6 @@ }; } // end anonymous namespace - // Checks whether the cpu arch is one where we want to expand // memcpy calls. static bool CPUCheck(const std::string &CpuStr) { @@ -86,7 +114,8 @@ } // Determines if we want to expand a specific memcpy call. 
-static bool shouldExpandMemCpy(MemCpyInst *MC) { +bool PPCLowerMemIntrinsics::shouldExpandMemCpy(MemCpyInst *MC, + ProfileSummaryInfo *PSI, BlockFrequencyInfo &BFI) { // If compiling for -O0, -Oz or -Os we don't want to expand. Function *ParentFunc = MC->getParent()->getParent(); if (ParentFunc->optForSize() || @@ -101,15 +130,42 @@ return false; } - // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil]. + // Check if the memcpy call has a known size. ConstantInt *CISize = dyn_cast(MC->getLength()); - if (CISize) { - return CISize->getZExtValue() >= MemcpyLoopFloor && - CISize->getZExtValue() <= MemcpyLoopCeil; + if (CISize && !MemcpyLoopDoKnown) + return false; + + if (!CISize && !MemcpyLoopDoUnknown) + return false; + + // Do not expand memcpy calls within cold call sites. + bool HasPGOInfo = false; + if (PSI) { + Optional Count = PSI->getProfileCount(MC, &BFI); + if (Count.hasValue()) { + HasPGOInfo = true; + if (PSI->isColdCallSite(CallSite(MC), &BFI)) + return false; + } } - // Otherwise expand unkown sizes ... - return true; + // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil]. + if (CISize) + return CISize->getZExtValue() >= MemcpyLoopFloor && + CISize->getZExtValue() <= MemcpyLoopCeil; + + // For unknown size, only version if there is PGO info. + return (HasPGOInfo || MemcpyLoopDoUnknownNonPGO); +} + +// Return the condition to be used to determine unknown size memCpy expansion. 
+static Value *getExpandUnknownSizeMemCpyCond(MemCpyInst *MI) { + + IRBuilder<> Builder(MI); + Value *Op1 = MI->getLength(); + Value *Op2 = ConstantInt::get(Op1->getType(), MemcpyLoopUnknownThreshold); + Value *Cond = Builder.CreateICmpULE(Op1, Op2); + return Cond; } // Wrapper function that determines which expansion to call depending on if the @@ -122,10 +178,20 @@ ConstLen, MI->getAlignment(), MI->getAlignment(), MI->isVolatile(), MI->isVolatile(), TTI); } else { - createMemCpyLoopUnknownSize(MI, MI->getRawSource(), MI->getRawDest(), + // Create an if-then-else block and insert it before the memCpy instruction. + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(getExpandUnknownSizeMemCpyCond(MI), + MI, &ThenTerm, &ElseTerm, nullptr); + // Generate the memCpy expansion loop in the then-block. + createMemCpyLoopUnknownSize(ThenTerm, MI->getRawSource(), MI->getRawDest(), MI->getLength(), MI->getAlignment(), MI->getAlignment(), MI->isVolatile(), MI->isVolatile(), TTI); + + // Create a copy of MI and insert it into the else-block. + IRBuilder<> Builder(MI); + Builder.SetInsertPoint(ElseTerm); + Builder.Insert(MI->clone()); } } @@ -133,18 +199,27 @@ bool AnyExpanded = false; assert(Intrinsic::memcpy == F.getIntrinsicID() && "expandMemcopies called on wrong function declaration."); - // loop over all memcpy calls + + // Obtain profiling information. + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + + // Loop over all of the memcpy calls. 
for (auto I : F.users()) { MemCpyInst *MC = dyn_cast(I); assert(MC && "Must be a MemcpyInst!"); - if (shouldExpandMemCpy(MC)) { - Function *ParentFunc = MC->getParent()->getParent(); + + Function *ParentFunc = MC->getParent()->getParent(); + BlockFrequencyInfo &BFI = + getAnalysis(*ParentFunc).getBFI(); + + if (shouldExpandMemCpy(MC, PSI, BFI)) { const TargetTransformInfo &TTI = getAnalysis().getTTI(*ParentFunc); ppcExpandMemCpyAsLoop(MC, TTI); MC->eraseFromParent(); AnyExpanded = true; - MemCpyLoopExpansions += 1; + ++MemCpyLoopExpansions; } } return AnyExpanded; @@ -177,5 +252,9 @@ } char PPCLowerMemIntrinsics::ID = 0; -INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", - "Lower mem intrinsics into loops", false, false) +INITIALIZE_PASS_BEGIN(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", + "Lower mem intrinsics into loops", false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_END(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", + "Lower mem intrinsics into loops", false, false) Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -483,7 +483,7 @@ Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAlign, unsigned DestAlign) const { - return Type::getInt64Ty(Context); + return VectorType::get(Type::getInt64Ty(Context),8); } /// Decomposes a copy operation with size \p RemainingBytes into the individual @@ -492,13 +492,16 @@ SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const { // Types to use in copy operations. 
- IntegerType *CopyTypes[] = { + Type *CopyTypes[] = { + VectorType::get(Type::getInt64Ty(Context),2), Type::getInt64Ty(Context), Type::getInt32Ty(Context), Type::getInt16Ty(Context), Type::getInt8Ty(Context)}; // Deconstructs the remaining bytes into individual operands. for (auto OpTy : CopyTypes) { - unsigned OpSize = OpTy->getBitWidth() / 8; + unsigned OpSize = OpTy->getScalarSizeInBits() / 8; + if (OpTy->isVectorTy()) + OpSize *= OpTy->getVectorNumElements(); // Loops just in case the remaining bytes are greater or equal to // twice the largest copy operand type. while (RemainingBytes >= OpSize) { Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp =================================================================== --- lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -98,7 +98,7 @@ SrcAlign = std::min(SrcAlign, LoopOpSize); DestAlign = std::min(DestAlign, LoopOpSize); - SmallVector RemainingOps; + SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, SrcAlign, DestAlign); Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll =================================================================== --- test/CodeGen/PowerPC/memcpy-loop-expansion.ll +++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll @@ -1,10 +1,17 @@ ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \ +; RUN: -ppc-memcpy-unknown-loops=true \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \ ; RUN: -mcpu=pwr8 %s| FileCheck -check-prefix=OPT %s ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \ +; RUN: -ppc-memcpy-unknown-loops=true \ ; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \ ; RUN: FileCheck %s --check-prefix PWR7 +; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \ +; RUN: -ppc-memcpy-unknown-loops=true \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s | \ +; RUN: FileCheck %s --check-prefix OPTSMALL ; RUN: 
llc < %s -ppc-enable-memcpy-loops=true \ +; RUN: -ppc-memcpy-unknown-loops=true \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -O0 | \ ; RUN: FileCheck %s --check-prefix OPTNONE @@ -56,65 +63,89 @@ ret i8* %dst ; OPT-LABEL: @memcpy_known_size ; OPT: entry: -; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64* -; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64* +; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to <8 x i64>* +; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to <8 x i64>* ; OPT-NEXT: br label %load-store-loop ; OPT: load-store-loop: ; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ] -; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index -; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]] -; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index -; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]] +; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[SrcCast]], i64 %loop-index +; OPT-NEXT: [[Load:%[0-9]+]] = load <8 x i64>, <8 x i64>* [[SrcGep]] +; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[DstCast]], i64 %loop-index +; OPT-NEXT: store <8 x i64> [[Load]], <8 x i64>* [[DstGep]] ; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1 -; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12 +; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 1 ; OPT-NEXT: br i1 [[CMP]], label %load-store-loop, label %memcpy-split ; OPT: memcpy-split: -; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32* -; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24 -; OPT-NEXT: [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]] -; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32* -; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24 -; OPT-NEXT: store i32 [[Load2]], i32* 
[[DstGep2]] +; OPT-NEXT: [[SrcAs2Xi64:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>* +; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi64]], i64 4 +; OPT-NEXT: [[Load2:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep2]] +; OPT-NEXT: [[DstAs2xi64:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>* +; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2xi64]], i64 4 +; OPT-NEXT: store <2 x i64> [[Load2]], <2 x i64>* [[DstGep2]] +; OPT-NEXT: [[SrcAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>* +; OPT-NEXT: [[SrcGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi642]], i64 5 +; OPT-NEXT: [[Load3:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep3]] +; OPT-NEXT: [[DstAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>* +; OPT-NEXT: [[DstGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2Xi642]], i64 5 +; OPT-NEXT: store <2 x i64> [[Load3]], <2 x i64>* [[DstGep3]] +; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to i32* +; OPT-NEXT: [[SrcGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24 +; OPT-NEXT: [[Load4:%[0-9]+]] = load i32, i32* [[SrcGep4]] +; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to i32* +; OPT-NEXT: [[DstGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24 +; OPT-NEXT: store i32 [[Load4]], i32* [[DstGep4]] ; OPT-NEXT: ret i8* %dst } ; Check the expansion of a memcpy whose size argument is not a compile time ; constant. 
-define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) { +define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) !prof !29 { entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) ret i8* %dst ; OPT-LABEL: @memcpy_unkown_size ; OPT: entry: -; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64* -; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64* -; OPT-NEXT: [[LoopCount:%[0-9]+]] = udiv i64 %len, 8 -; OPT-NEXT: [[ResBytes:%[0-9]+]] = urem i64 %len, 8 +; OPT-NEXT: [[SizeCmp:%[0-9]+]] = icmp ule i64 %len, 128 +; OPT-NEXT: br i1 %0, label %[[ExpLabel:[0-9]+]], label %[[NoExpLabel:[0-9]+]] + +; OPT: