Index: lib/Target/PowerPC/CMakeLists.txt =================================================================== --- lib/Target/PowerPC/CMakeLists.txt +++ lib/Target/PowerPC/CMakeLists.txt @@ -27,6 +27,7 @@ PPCFastISel.cpp PPCFrameLowering.cpp PPCLoopPreIncPrep.cpp + PPCLowerMemIntrinsics.cpp PPCMCInstLower.cpp PPCMachineFunctionInfo.cpp PPCMIPeephole.cpp Index: lib/Target/PowerPC/PPC.h =================================================================== --- lib/Target/PowerPC/PPC.h +++ lib/Target/PowerPC/PPC.h @@ -27,6 +27,7 @@ class FunctionPass; class MachineInstr; class MachineOperand; + class ModulePass; class AsmPrinter; class MCInst; class MCOperand; @@ -50,6 +51,9 @@ FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); + + ModulePass *createPPCLowerMemIntrinsicsPass(); + + void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, @@ -60,6 +64,7 @@ void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); + void initializePPCLowerMemIntrinsicsPass(llvm::PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp =================================================================== --- /dev/null +++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp @@ -0,0 +1,163 @@ +//===-------- PPCLowerMemIntrinsics.cpp - Expand memory intrinsics -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// An IR to IR pass that expands llvm.memcpy intrinsics into the equivalent +/// load-store loops. 
+/// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +#define DEBUG_TYPE "ppc-memcpy-loop-lowering" + +// This pass will loop over all MemCpyInstrs and expand some of them into loops. +// For known compile time sizes, calls where the size belongs to +// [MemcpyLoopFloor, MemcpyLoopCeil] will be expanded. For unknown sizes we are +// expanding all call sites. + +STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop."); + +using namespace llvm; + +static cl::opt<bool> EnableMemcpyExpansionPass( + "ppc-enable-memcpy-loops", + cl::desc("Enable the PPC pass that lowers memcpy calls into loops."), + cl::init(false), cl::Hidden); + +// Options used to tune the size range where memcpy expansions occur. +static cl::opt<unsigned> MemcpyLoopFloor( + "ppc-memcpy-loop-floor", cl::Hidden, cl::init(129), + cl::desc( + "The lower size bound of memcpy calls to get expanded into a loop")); + +static cl::opt<unsigned> MemcpyLoopCeil( + "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(256), + cl::desc("The upper size bound of memcpy calls to get expanded in a loop")); + +namespace { +class PPCLowerMemIntrinsics : public ModulePass { +public: + static char ID; + + PPCLowerMemIntrinsics() : ModulePass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override; + /// Loops over all uses of llvm.memcpy and expands the call if warranted. 
+ /// \p MemcpyDecl is the function declaration of llvm.memcpy. + bool expandMemcopies(Function &MemcpyDecl); + + StringRef getPassName() const override { + return "PPC Lower memcpy into loops"; + } +}; +} // end anonymous namespace + + +// Checks whether the cpu arch is one where we want to expand +// memcpy calls. +static bool CPUCheck(const std::string &CpuStr) { + return StringSwitch<bool>(CpuStr) + .Case("pwr8", true) + .Case("pwr9", true) + .Case("ppc64le", true) // The default cpu for little-endian. + .Default(false); +} + +// Determines if we want to expand a specific memcpy call. +static bool shouldExpandMemCpy(MemCpyInst *MC) { + // If compiling for -O0, -Oz or -Os we don't want to expand. + Function *ParentFunc = MC->getParent()->getParent(); + if (ParentFunc->optForSize() || + ParentFunc->hasFnAttribute(Attribute::OptimizeNone)) + return false; + + // See if the cpu arch is one we want to expand for. If there is no + // target-cpu attribute assume we don't want to expand. + Attribute CPUAttr = ParentFunc->getFnAttribute("target-cpu"); + if (CPUAttr.hasAttribute(Attribute::None) || + !CPUCheck(CPUAttr.getValueAsString())) { + return false; + } + + // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil]. + ConstantInt *CISize = dyn_cast<ConstantInt>(MC->getLength()); + if (CISize) { + return CISize->getZExtValue() >= MemcpyLoopFloor && + CISize->getZExtValue() <= MemcpyLoopCeil; + } + + // Otherwise expand unknown sizes ... 
+ return true; +} + +bool PPCLowerMemIntrinsics::expandMemcopies(Function &F) { + bool AnyExpanded = false; + assert(Intrinsic::memcpy == F.getIntrinsicID() && + "expandMemcopies called on wrong function declaration."); + // Step the iterator first: erasing a call unlinks it from F's use list. + for (auto UI = F.user_begin(), UE = F.user_end(); UI != UE;) { + MemCpyInst *MC = dyn_cast<MemCpyInst>(*UI++); + assert(MC && "Must be a MemcpyInst!"); + if (shouldExpandMemCpy(MC)) { + Function *ParentFunc = MC->getParent()->getParent(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); + expandMemCpyAsLoop(MC, TTI); + MC->eraseFromParent(); + AnyExpanded = true; + ++MemCpyLoopExpansions; + } + } + return AnyExpanded; +} + +bool PPCLowerMemIntrinsics::runOnModule(Module &M) { + if (!EnableMemcpyExpansionPass || skipModule(M)) + return false; + + bool Modified = false; + for (Function &F : M) { + // Looking for the declaration of llvm.memcpy so we can skip + // any definition. + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + default: + break; + case Intrinsic::memcpy: + Modified |= expandMemcopies(F); + } + } + + return Modified; +} + +ModulePass *llvm::createPPCLowerMemIntrinsicsPass() { + return new PPCLowerMemIntrinsics(); +} + +char PPCLowerMemIntrinsics::ID = 0; +INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics", + "Lower mem intrinsics into loops", false, false) Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -102,6 +102,7 @@ initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCTLSDynamicCallPass(PR); + initializePPCLowerMemIntrinsicsPass(PR); } /// Return the datalayout string of a subtarget. 
@@ -337,6 +338,10 @@ if (UsePrefetching) addPass(createLoopDataPrefetchPass()); + + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCLowerMemIntrinsicsPass()); + + if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { // Call SeparateConstOffsetFromGEP pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -91,7 +91,13 @@ ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); - + Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAlign, unsigned DestAlign) const; + void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut, + LLVMContext &Context, + unsigned RemainingBytes, + unsigned SrcAlign, + unsigned DestAlign) const; /// @} }; Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -480,3 +480,31 @@ return Cost; } +Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAlign, + unsigned DestAlign) const { + return Type::getInt64Ty(Context); +} + +/// Decomposes a copy operation with size \p RemainingBytes into the individual +/// operands. +void PPCTTIImpl::getMemcpyLoopResidualLoweringType( + SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const { + // Types to use in copy operations. + IntegerType *CopyTypes[] = { + Type::getInt64Ty(Context), Type::getInt32Ty(Context), + Type::getInt16Ty(Context), Type::getInt8Ty(Context)}; + + // Deconstructs the remaining bytes into individual operands. 
+ for (auto OpTy : CopyTypes) { + unsigned OpSize = OpTy->getBitWidth() / 8; + // Loops just in case the remaining bytes are greater or equal to + // twice the largest copy operand type. + while (RemainingBytes >= OpSize) { + RemainingBytes -= OpSize; + OpsOut.push_back(OpTy); + } + } +} + Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll @@ -0,0 +1,163 @@ +; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \ +; RUN: -mcpu=pwr8 %s| FileCheck -check-prefix=OPT %s +; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \ +; RUN: FileCheck %s --check-prefix PWR7 +; RUN: llc < %s -ppc-enable-memcpy-loops=true \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -O0 | \ +; RUN: FileCheck %s --check-prefix OPTNONE + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #0 + +; Check that memcpy calls with a known zero length are removed. +define i8* @memcpy_zero_size(i8* %dst, i8* %src) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 0, i32 1, i1 false) + ret i8* %dst + +; OPT-LABEL: @memcpy_zero_size +; OPT-NEXT: entry: +; OPT-NEXT: ret i8* %dst +} + +; Check that a memcpy call with a known size smaller than the loop operand +; type is handled properly. 
+define i8* @memcpy_small_size(i8* %dst, i8* %src) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 7, i32 1, i1 false) + ret i8* %dst + +; OPT-LABEL: @memcpy_small_size +; OPT-NEXT: entry: +; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i8* %src to i32* +; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 0 +; OPT-NEXT: [[Load:%[0-9]+]] = load i32, i32* [[SrcGep]] +; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i8* %dst to i32* +; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 0 +; OPT-NEXT: store i32 [[Load]], i32* [[DstGep]] +; OPT-NEXT: [[SrcAsi16:%[0-9]+]] = bitcast i8* %src to i16* +; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[SrcAsi16]], i64 2 +; OPT-NEXT: [[Load2:%[0-9]+]] = load i16, i16* [[SrcGep2]] +; OPT-NEXT: [[DstAsi16:%[0-9]+]] = bitcast i8* %dst to i16* +; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[DstAsi16]], i64 2 +; OPT-NEXT: store i16 [[Load2]], i16* [[DstGep2]] +; OPT-NEXT: [[SrcGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 6 +; OPT-NEXT: [[Load3:%[0-9]+]] = load i8, i8* [[SrcGep3]] +; OPT-NEXT: [[DstGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 6 +; OPT-NEXT: store i8 [[Load3]], i8* [[DstGep3]] +; OPT-NEXT: ret i8* %dst +} + +; Check the expansion of a memcpy call with a compile-time constant size. 
+define i8* @memcpy_known_size(i8* %dst, i8* %src) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 100, i32 1, i1 false) + ret i8* %dst +; OPT-LABEL: @memcpy_known_size +; OPT: entry: +; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64* +; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64* +; OPT-NEXT: br label %load-store-loop + +; OPT: load-store-loop: +; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ] +; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index +; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]] +; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index +; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]] +; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1 +; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12 +; OPT-NEXT: br i1 [[CMP]], label %load-store-loop, label %memcpy-split + +; OPT: memcpy-split: +; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32* +; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24 +; OPT-NEXT: [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]] +; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32* +; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24 +; OPT-NEXT: store i32 [[Load2]], i32* [[DstGep2]] +; OPT-NEXT: ret i8* %dst +} + + +; Check the expansion of a memcpy whose size argument is not a compile-time +; constant. 
+define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) + ret i8* %dst + +; OPT-LABEL: @memcpy_unkown_size +; OPT: entry: +; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64* +; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64* +; OPT-NEXT: [[LoopCount:%[0-9]+]] = udiv i64 %len, 8 +; OPT-NEXT: [[ResBytes:%[0-9]+]] = urem i64 %len, 8 +; OPT-NEXT: [[BytesCopied:%[0-9]+]] = sub i64 %len, [[ResBytes]] +; OPT-NEXT: [[Cmp:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0 +; OPT-NEXT: br i1 [[Cmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header + +; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: ret i8* %dst + +; OPT: loop-memcpy-expansion: +; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index +; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]] +; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index +; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]] +; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1 +; OPT-NEXT: [[LoopCmp:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]] +; OPT-NEXT: br i1 [[LoopCmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header + +; OPT: loop-memcpy-residual: +; OPT-NEXT: %residual-loop-index = phi i64 [ 0, %loop-memcpy-residual-header ], [ [[ResIndexInc:%[0-9]+]], %loop-memcpy-residual ] +; OPT-NEXT: [[SrcAsi8:%[0-9]+]] = bitcast i64* [[SrcCast]] to i8* +; OPT-NEXT: [[DstAsi8:%[0-9]+]] = bitcast i64* [[DstCast]] to i8* +; OPT-NEXT: [[ResIndex:%[0-9]+]] = add i64 [[BytesCopied]], %residual-loop-index +; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[SrcAsi8]], i64 [[ResIndex]] +; OPT-NEXT: [[Load2:%[0-9]+]] = load i8, i8* [[SrcGep2]] +; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i8, i8* 
[[DstAsi8]], i64 [[ResIndex]] +; OPT-NEXT: store i8 [[Load2]], i8* [[DstGep2]] +; OPT-NEXT: [[ResIndexInc]] = add i64 %residual-loop-index, 1 +; OPT-NEXT: [[RCmp:%[0-9]+]] = icmp ult i64 [[ResIndexInc]], [[ResBytes]] +; OPT-NEXT: br i1 [[RCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion + +; OPT: loop-memcpy-residual-header: +; OPT-NEXT: [[RHCmp:%[0-9]+]] = icmp ne i64 [[ResBytes]], 0 +; OPT-NEXT: br i1 [[RHCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion +} + +; Ensure the pass doesn't expand memcpy calls when compiling a function with an +; unsupported target-cpu attribute. +define i8* @memcpy_power7(i8* %dst, i8* %src, i64 %len) #1 { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) + ret i8* %dst +; PWR7-LABEL: @memcpy_power7 +; PWR7: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) +} + +; Ensure the pass doesn't expand calls in a function compiled for size. +define i8* @memcpy_opt_small(i8* %dst, i8* %src, i64 %len) #2 { + entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) + ret i8* %dst +; OPT-LABEL: @memcpy_opt_small +; OPT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) +} + +; Ensure the pass doesn't expand calls on functions not compiled with +; optimizations. +define i8* @memcpy_opt_none(i8* %dst, i8* %src, i64 %len) { + entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false) + ret i8* %dst +; OPTNONE-LABEL: @memcpy_opt_none +; OPTNONE: bl memcpy +} + +attributes #0 = { argmemonly nounwind } +attributes #1 = { "target-cpu"="pwr7" } +attributes #2 = { "target-cpu"="pwr8" optsize }