Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -21,6 +21,7 @@
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
   PPCEarlyReturn.cpp
+  PPCEnableShrinkWrap.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCLoopDataPrefetch.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -27,9 +27,10 @@
   class FunctionPass;
   class ImmutablePass;
   class MachineInstr;
+  class ModulePass;
   class AsmPrinter;
   class MCInst;
 
   FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
 #ifndef NDEBUG
   FunctionPass *createPPCCTRLoopsVerify();
@@ -44,60 +45,62 @@
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
+  ModulePass *createPPCEnableShrinkWrapPass();
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
 
   void initializePPCVSXFMAMutatePass(PassRegistry&);
+  void initializePPCEnableShrinkWrapPass(PassRegistry&);
   extern char &PPCVSXFMAMutateID;
 
   namespace PPCII {
 
     /// Target Operand Flag enum.
     enum TOF {
       //===------------------------------------------------------------------===//
       // PPC Specific MachineOperand flags.
       MO_NO_FLAG,
 
       /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
       /// reference is actually to the "FOO$stub" or "FOO@plt" symbol.  This is
       /// used for calls and jumps to external functions on Tiger and earlier, and
       /// for PIC calls on Linux and ELF systems.
       MO_PLT_OR_STUB = 1,
 
       /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
       /// the function's picbase, e.g. lo16(symbol-picbase).
       MO_PIC_FLAG = 2,
 
       /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
       /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
       MO_NLP_FLAG = 4,
 
       /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
       /// symbol with hidden visibility.  This causes a different kind of
       /// non-lazy-pointer to be generated.
       MO_NLP_HIDDEN_FLAG = 8,
 
       /// The next are not flags but distinct values.
       MO_ACCESS_MASK = 0xf0,
 
       /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
       MO_LO = 1 << 4,
       MO_HA = 2 << 4,
 
       MO_TPREL_LO = 4 << 4,
       MO_TPREL_HA = 3 << 4,
 
       /// These values identify relocations on immediates folded
       /// into memory operations.
       MO_DTPREL_LO = 5 << 4,
       MO_TLSLD_LO  = 6 << 4,
       MO_TOC_LO    = 7 << 4,
 
       // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
       MO_TLS       = 8 << 4
     };
   } // end namespace PPCII
 
 } // end namespace llvm;
 
 #endif
Index: lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
===================================================================
--- lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
+++ lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
@@ -0,0 +1,259 @@
+//===- PPCEnableShrinkWrap.cpp - create shrink-wrapping opportunities -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Shrink wrapping functions can improve performance for functions when the
+// stack is unused. However, all functions where value live-ranges contain
+// function calls are ineligible for shrink wrapping. Why?
+//
+// 1. To avoid saving and restoring values across call-sites, LLVM allocates
+// values with live-ranges that cross call sites to callee-saved registers
+//
+// 2. If callee-saved registers are used, they must be saved (usually in the
+// function prolog)
+//
+// 3. ShrinkWrapping scans from the beginning of the function prolog to the
+// first use of the stack. If a callee-saved register is saved to the stack in
+// the function prolog, ShrinkWrapping will give up early.
+//
+// To increase the applicability of ShrinkWrapping, this pass identifies
+// instances where a live-range straddling a call-site prevents ShrinkWrapping,
+// and splits the original function into a stackless ShrinkWrappable stub that
+// potentially makes a tail call to the remainder of the function.
+//
+// This transformation harms debuggability and consequently is not suitable for
+// lower -O levels.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class PPCEnableShrinkWrap : public ModulePass {
+
+  /// Return the single block of \p F terminated by a return instruction, or
+  /// null if there are zero or several such blocks.
+  static BasicBlock *getUniqueExitBlock(Function &F) {
+    BasicBlock *ExitBlock = nullptr;
+    for (auto &BB : F) {
+      if (isa<ReturnInst>(BB.getTerminator())) {
+        if (ExitBlock)
+          return nullptr; // More than one exit block.
+        else
+          ExitBlock = &BB;
+      }
+    }
+    return ExitBlock;
+  }
+
+  /// Return true if \p Term lists \p Succ among its successors.
+  static bool branchesTo(TerminatorInst *Term, BasicBlock *Succ) {
+    for (unsigned i = 0; i < Term->getNumSuccessors(); ++i)
+      if (Term->getSuccessor(i) == Succ)
+        return true;
+    return false;
+  }
+
+  /// Conservatively decide whether \p V can be evaluated without touching the
+  /// stack: no allocas, no non-lifetime calls, no side effects and no memory
+  /// reads, recursively over all operands.
+  ///
+  /// NOTE(review): the isa<>/dyn_cast<> template arguments in this function
+  /// were lost in patch transit; the class list below is reconstructed from
+  /// context -- confirm against the original patch.
+  static bool canBeComputedWithoutStack(llvm::Value *V) {
+    if (isa<Argument>(V))
+      return true;
+    if (isa<Constant>(V))
+      return true;
+    if (isa<GlobalValue>(V))
+      return true;
+    if (isa<BasicBlock>(V))
+      return true;
+    if (isa<AllocaInst>(V))
+      return false;
+
+    // Lifetime markers are annotations only; they expand to no code.
+    if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
+      return I->getIntrinsicID() == Intrinsic::lifetime_end ||
+             I->getIntrinsicID() == Intrinsic::lifetime_start;
+
+    if (isa<CallInst>(V))
+      return false;
+
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (I->mayHaveSideEffects())
+      return false;
+    if (I->mayReadFromMemory())
+      return false;
+
+    // Every operand must be stack-free as well.
+    for (unsigned i = 0; i < I->getNumOperands(); ++i)
+      if (!canBeComputedWithoutStack(I->getOperand(i)))
+        return false;
+
+    return true;
+  }
+
+  /// Check the exit block: every instruction -- and, for PHIs, the incoming
+  /// value along the edge from the entry block -- must be computable without
+  /// the stack.
+  static bool canBeComputedWithoutStack(BasicBlock *ExitBlock,
+                                        BasicBlock &Entry) {
+    for (auto &I : *ExitBlock) {
+      if (PHINode *Phi = dyn_cast<PHINode>(&I)) {
+        if (!canBeComputedWithoutStack(Phi->getIncomingValueForBlock(&Entry)))
+          return false;
+      } else if (!canBeComputedWithoutStack(&I)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// True if \p F contains at least one real (non-intrinsic) call.
+  static bool hasCallInst(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (isa<CallInst>(I) && !isa<IntrinsicInst>(I))
+          return true;
+    return false;
+  }
+
+  /// True if \p F contains a call to a variadic function.
+  static bool hasVariadicCall(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (CallInst *CI = dyn_cast<CallInst>(&I))
+          if (CI->getFunctionType()->isVarArg())
+            return true;
+    return false;
+  }
+
+  /// True if any instruction in \p BB may have side effects.
+  static bool mayHaveSideEffects(BasicBlock &BB) {
+    for (auto &I : BB)
+      if (I.mayHaveSideEffects())
+        return true;
+    return false;
+  }
+
+public:
+  static char ID;
+  PPCEnableShrinkWrap() : ModulePass(ID) {
+    initializePPCEnableShrinkWrapPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    // Snapshot the function list first: runOnFunction appends ".cont" clones
+    // to the module, and those must not themselves be revisited.
+    SmallVector<Function *, 16> Funcs;
+    for (auto &F : M)
+      Funcs.emplace_back(&F);
+    for (auto *F : Funcs)
+      if (!F->isDeclaration())
+        Changed |= runOnFunction(*F);
+    return Changed;
+  }
+
+  /// Try to split \p F into a stackless early-exit stub that tail-calls a
+  /// clone of the full function. Returns true if \p F was transformed.
+  static bool runOnFunction(Function &F) {
+    // Presently, only implemented for non-variadic functions that don't call
+    // variadic functions and that contain at least one callsite.
+    if (F.isVarArg())
+      return false;
+    if (!hasCallInst(F))
+      return false;
+    if (hasVariadicCall(F))
+      return false;
+
+    // Only unique exit blocks are handled.
+    BasicBlock *ExitBlock = getUniqueExitBlock(F);
+    if (!ExitBlock)
+      return false;
+
+    // Look for the pattern of entry blocks that conditionally branch straight
+    // to the exit, where both entry and exit can be computed without the
+    // stack.
+    BasicBlock &Entry = F.getEntryBlock();
+    BranchInst *EntryTerm = dyn_cast<BranchInst>(Entry.getTerminator());
+    if (!EntryTerm)
+      return false;
+    if (EntryTerm->isUnconditional())
+      return false;
+    // A constant condition would fold away; nothing to gain.
+    if (isa<Constant>(EntryTerm->getCondition()))
+      return false;
+    if (!branchesTo(EntryTerm, ExitBlock))
+      return false;
+    if (!canBeComputedWithoutStack(EntryTerm))
+      return false;
+    if (!canBeComputedWithoutStack(ExitBlock, Entry))
+      return false;
+    if (mayHaveSideEffects(Entry))
+      return false;
+
+    // Clone the original function.
+    ValueToValueMapTy VMap;
+    ClonedCodeInfo CodeInfo;
+    Function *CloneF = CloneFunction(&F, VMap, true, &CodeInfo);
+    CloneF->setName(F.getName() + ".cont");
+    CloneF->setLinkage(GlobalValue::InternalLinkage);
+    F.getParent()->getFunctionList().push_back(CloneF);
+
+    // Replace the non-exit successor of the entry block with a block that
+    // makes a tail call to the cloned version of the function.
+    LLVMContext &C = F.getContext();
+    BasicBlock *TailCallBB = BasicBlock::Create(C, "tailCallBB", &F);
+    for (unsigned i = 0; i < EntryTerm->getNumSuccessors(); ++i) {
+      BasicBlock *Succ = EntryTerm->getSuccessor(i);
+      if (Succ != ExitBlock) {
+        // The old successor no longer has Entry as a predecessor; drop the
+        // corresponding PHI entries.
+        for (auto I = Succ->begin(); &*I != Succ->getFirstNonPHI(); ++I) {
+          PHINode *Phi = cast<PHINode>(I);
+          Phi->removeIncomingValue(&Entry);
+        }
+        EntryTerm->setSuccessor(i, TailCallBB);
+      }
+    }
+    // Create the tail call.
+    SmallVector<Value *, 8> Args;
+    for (auto &Arg : F.args())
+      Args.emplace_back(&Arg);
+    CallInst *TailCall = CallInst::Create(CloneF, Args, "", TailCallBB);
+    TailCall->setTailCall(true);
+    TailCall->setCallingConv(CloneF->getCallingConv());
+    // Return the result of the tail call if necessary.
+    if (F.getReturnType()->isVoidTy())
+      ReturnInst::Create(C, TailCallBB);
+    else
+      ReturnInst::Create(C, TailCall, TailCallBB);
+
+    // In the clone function, we know the non-early-exit path will be taken.
+    BranchInst *CloneEntryTerm = cast<BranchInst>(VMap[EntryTerm]);
+    Value *Cond = CloneEntryTerm->getSuccessor(0) == VMap[ExitBlock]
+                      ? ConstantInt::getFalse(C)
+                      : ConstantInt::getTrue(C);
+    CloneEntryTerm->setCondition(Cond);
+    // Add a hint to favor the early exit path. This improves performance when
+    // the early exit path is common and does not harm performance much if the
+    // early exit path is rare and the non-early exit path is relatively long
+    // running.
+    ExitBlock->moveAfter(&Entry);
+    SmallVector<uint32_t, 2> Weights = {
+        EntryTerm->getSuccessor(0) == ExitBlock ? 100u : 1u,
+        EntryTerm->getSuccessor(0) == ExitBlock ? 1u : 100u};
+    EntryTerm->setMetadata(LLVMContext::MD_prof,
+                           MDBuilder(C).createBranchWeights(Weights));
+    return true;
+  }
+};
+
+} // end anonymous namespace
+
+char PPCEnableShrinkWrap::ID = 0;
+INITIALIZE_PASS(PPCEnableShrinkWrap, "create-wrapper-opps",
+                "Find opportunities for wrapping", false, false)
+
+ModulePass *llvm::createPPCEnableShrinkWrapPass() {
+  return new PPCEnableShrinkWrap();
+}
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -67,46 +67,49 @@
   RegisterTargetMachine<PPCTargetMachine> A(ThePPC32Target);
   RegisterTargetMachine<PPCTargetMachine> B(ThePPC64Target);
   RegisterTargetMachine<PPCTargetMachine> C(ThePPC64LETarget);
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializePPCEnableShrinkWrapPass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
 static std::string getDataLayoutString(const Triple &T) {
   bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
   std::string Ret;
 
   // Most PPC* platforms are big endian, PPC64LE is little endian.
   if (T.getArch() == Triple::ppc64le)
     Ret = "e";
   else
     Ret = "E";
 
   Ret += DataLayout::getManglingComponent(T);
 
   // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
   // pointers.
   if (!is64Bit || T.getOS() == Triple::Lv2)
     Ret += "-p:32:32";
 
   // Note, the alignment values for f64 and i64 on ppc64 in Darwin
   // documentation are wrong; these are correct (i.e. "what gcc does").
   if (is64Bit || !T.isOSDarwin())
     Ret += "-i64:64";
   else
     Ret += "-f64:32:64";
 
   // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
   if (is64Bit)
     Ret += "-n32:64";
   else
     Ret += "-n32";
 
   return Ret;
 }
 
 static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
                                       const Triple &TT) {
   std::string FullFS = FS;
 
   // Make sure 64-bit features are available when CPUname is generic
   if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
     if (!FullFS.empty())
@@ -282,8 +285,11 @@
 }
 
 void PPCPassConfig::addIRPasses() {
+  if (TM->getOptLevel() > CodeGenOpt::Default)
+    addPass(createPPCEnableShrinkWrapPass());
+
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));
 
   // For the BG/Q (or if explicitly requested), add explicit data prefetch
   // intrinsics.
   bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
Index: test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
===================================================================
--- test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
+++ test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
@@ -0,0 +1,104 @@
+; RUN: opt -create-wrapper-opps -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-LABEL: testEnableShrinkWrap
+define signext i32 @testEnableShrinkWrap(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; Fixed: the original check read "CHECK call ... @testEnableStrinkWrap.cont" --
+; missing the colon (so FileCheck silently ignored it) and misspelling the
+; symbol name.
+; CHECK: call i32 @testEnableShrinkWrap.cont
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: doSomething
+declare void @doSomething()
+
+; CHECK-LABEL: testEnableShrinkWrapVariadic
+define signext i32 @testEnableShrinkWrapVariadic(i32 signext %i, ...) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: callVariadic
+define signext i32 @callVariadic(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void (i32, ...) @doSomethingVariadic
+  tail call void (i32, ...) @doSomethingVariadic(i32 signext %i, i32 signext 7)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: declare void @doSomethingVariadic
+declare void @doSomethingVariadic(i32 signext, ...)
+
+; CHECK-LABEL: tooComplicatedControlFlow
+define signext i32 @tooComplicatedControlFlow(i32 zeroext %i, i32 zeroext %j) {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %dec, %do.body ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec = add i32 %i.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.body.1.preheader, label %do.body
+
+do.body.1.preheader:                              ; preds = %do.body
+  br label %do.body.1
+
+do.body.1:                                        ; preds = %do.body.1.preheader, %do.body.1
+  %j.addr.0 = phi i32 [ %dec3, %do.body.1 ], [ %j, %do.body.1.preheader ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec3 = add i32 %j.addr.0, -1
+  %tobool4 = icmp eq i32 %dec3, 0
+  br i1 %tobool4, label %do.end.5, label %do.body.1
+
+do.end.5:                                         ; preds = %do.body.1
+  ret i32 7
+}
+
+; CHECK-LABEL: notSideEffectFree
+define signext i32 @notSideEffectFree(i32* nocapture readonly %i) {
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %i, i64 1
+  %0 = load i32, i32* %i, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %1 = load i32, i32* %incdec.ptr, align 4
+  ret i32 %1
+}
+
+; CHECK-LABEL: testEnableShrinkWrap.cont
+; CHECK: call void @doSomething