Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -22,6 +22,7 @@
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
   PPCEarlyReturn.cpp
+  PPCEnableShrinkWrap.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCLoopDataPrefetch.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -27,6 +27,7 @@
   class FunctionPass;
   class ImmutablePass;
   class MachineInstr;
+  class ModulePass;
   class AsmPrinter;
   class MCInst;

@@ -46,11 +47,13 @@
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
+  ModulePass *createPPCEnableShrinkWrapPass();
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);

   void initializePPCVSXFMAMutatePass(PassRegistry&);
   void initializePPCBoolRetToIntPass(PassRegistry&);
+  void initializePPCEnableShrinkWrapPass(PassRegistry&);
   extern char &PPCVSXFMAMutateID;

   namespace PPCII {
Index: lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
===================================================================
--- lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
+++ lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
@@ -0,0 +1,279 @@
+//===- PPCEnableShrinkWrap.cpp - split functions to create shrink wrapping
+// opportunities ==/
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Shrink wrapping functions can improve performance for functions when the
+// stack is unused. However, all functions where value live-ranges contain
+// function calls are ineligible for shrink wrapping. Why?
+//
+// 1. To avoid saving and restoring values across call-sites, LLVM allocates
+// values with live-ranges that cross call sites to callee-saved registers
+//
+// 2. If callee-saved registers are used, they must be saved (usually in the
+// function prolog)
+//
+// 3. ShrinkWrapping scans from the beginning of the function prolog to the
+// first use of the stack. If a callee-saved register is saved to the stack in
+// the function prolog, ShrinkWrapping will give up early.
+//
+// To increase the applicability of ShrinkWrapping, this pass identifies
+// instances where a live-range straddling a call-site prevents ShrinkWrapping.
+// The pass switches the calling convention to CXX_FAST_TLS, that saves callee
+// registers to virtual registers. This allows the backend to make better
+// choices about where to save registers and in many cases avoids saving
+// registers to the stack. The exit block is privatized for the fast and slow
+// exit paths to allow each to be specialized independently.
+//
+//===----------------------------------------------------------------------===//
+#include "PPC.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisablePPCEnableShrink("disable-ppc-enable-shrink-wrap", cl::Hidden,
+                       cl::desc("Disable PPC enable shrink wrap pass"));
+
+/// IR-level module pass that switches eligible functions to the CXX_FAST_TLS
+/// calling convention and gives the entry block's early-exit edge a private
+/// copy of the exit block.
+class PPCEnableShrinkWrap : public ModulePass {
+
+  /// Return the one block of \p F whose terminator passes the exit test, or
+  /// nullptr when no block or more than one block does.
+  static BasicBlock *getUniqueExitBlock(Function &F) {
+    BasicBlock *ExitBlock = nullptr;
+    for (auto &BB : F) {
+      if (isa(BB.getTerminator())) {
+        if (ExitBlock)
+          return nullptr;
+        else
+          ExitBlock = &BB;
+      }
+    }
+    return ExitBlock;
+  }
+
+  /// Return true if \p Succ is one of the successors of \p Term.
+  static bool branchesTo(TerminatorInst *Term, BasicBlock *Succ) {
+    for (unsigned i = 0; i < Term->getNumSuccessors(); ++i)
+      if (Term->getSuccessor(i) == Succ)
+        return true;
+    return false;
+  }
+
+  /// Conservatively decide whether \p V can be computed without using the
+  /// stack. A series of value-kind tests accepts or rejects \p V outright; any
+  /// other value qualifies only if it is an instruction that has no side
+  /// effects, reads no memory, and whose operands all qualify recursively.
+  static bool canBeComputedWithoutStack(Value *V) {
+    if (isa(V))
+      return true;
+    if (isa(V))
+      return true;
+    if (isa(V))
+      return true;
+    if (isa(V))
+      return true;
+    if (isa(V))
+      return false;
+    if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
+      return
+        I->getIntrinsicID() == Intrinsic::lifetime_end ||
+        I->getIntrinsicID() == Intrinsic::lifetime_start;
+    if (isa(V))
+      return true;
+    if (isa(V))
+      return false;
+    // Some casts and extracts require a stack slot for proc <= P7
+    // Conservatively assume all casts and extracts use the stack, since there
+    // is no easy way to query this information from the Target
+    if (isa(V) || isa(V))
+      return false;
+
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (I->mayHaveSideEffects())
+      return false;
+    if (I->mayReadFromMemory())
+      return false;
+
+    for (unsigned i = 0; i < I->getNumOperands(); ++i)
+      if (!canBeComputedWithoutStack(I->getOperand(i)))
+        return false;
+
+    return true;
+  }
+
+  /// Return true if every instruction in \p exitBlock can be computed without
+  /// the stack. A PHI is judged by the value it receives from \p entry.
+  static bool canBeComputedWithoutStack(BasicBlock *exitBlock,
+                                        BasicBlock &entry) {
+    for (auto &I : *exitBlock) {
+      if (PHINode *Phi = dyn_cast<PHINode>(&I)) {
+        if (!canBeComputedWithoutStack(Phi->getIncomingValueForBlock(&entry)))
+          return false;
+      } else if (!canBeComputedWithoutStack(&I)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Return true if \p F contains an instruction counted as a call site.
+  static bool hasCallInst(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (isa(I) && !isa(I))
+          return true;
+    return false;
+  }
+
+  /// Return true if \p F contains a call whose function type is variadic.
+  static bool hasVariadicCall(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (CallInst *CI = dyn_cast<CallInst>(&I))
+          if (CI->getFunctionType()->isVarArg())
+            return true;
+    return false;
+  }
+
+  /// Return true if any instruction in \p BB may have side effects.
+  static bool mayHaveSideEffects(BasicBlock &BB) {
+    for (auto &I : BB)
+      if (I.mayHaveSideEffects())
+        return true;
+    return false;
+  }
+
+public:
+  static char ID;
+  PPCEnableShrinkWrap() : ModulePass(ID) {
+    initializePPCEnableShrinkWrapPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  /// Run the transformation on every function defined in \p M unless the pass
+  /// was disabled on the command line. Returns true if any function changed.
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    if (DisablePPCEnableShrink) return Changed;
+    SmallVector Funcs;
+    for (auto &F : M)
+      Funcs.emplace_back(&F);
+    for (auto &F : Funcs)
+      if (!F->isDeclaration())
+        Changed |= runOnFunction(*F);
+    return Changed;
+  }
+
+  /// Transform \p F if it matches the supported pattern: a non-variadic
+  /// function using the C calling convention, with a unique exit block, whose
+  /// side-effect-free entry block conditionally branches directly to that exit
+  /// block. Returns true if \p F was modified.
+  static bool runOnFunction(Function &F) {
+    // Presently, only implemented for non-variadic functions that don't call
+    // variadic functions and that contain at least one callsite.
+    if (F.isVarArg())
+      return false;
+    if (!hasCallInst(F))
+      return false;
+    if (hasVariadicCall(F))
+      return false;
+
+    // Only unique exit blocks are handled
+    BasicBlock *ExitBlock = getUniqueExitBlock(F);
+    if (!ExitBlock)
+      return false;
+
+    // Look for the pattern of entry blocks that immediately branch to exit
+    // where the entry block and the exit block can be computed without the
+    // stack.
+    BasicBlock &Entry = F.getEntryBlock();
+    BranchInst *EntryTerm = dyn_cast<BranchInst>(Entry.getTerminator());
+    if (!EntryTerm)
+      return false;
+    if (EntryTerm->isUnconditional())
+      return false;
+    if (isa(EntryTerm->getCondition()))
+      return false;
+    if (!branchesTo(EntryTerm, ExitBlock))
+      return false;
+    if (!canBeComputedWithoutStack(EntryTerm))
+      return false;
+    if (!canBeComputedWithoutStack(ExitBlock, Entry))
+      return false;
+    if (mayHaveSideEffects(Entry))
+      return false;
+
+    if (F.getCallingConv() != CallingConv::C)
+      return false;
+
+    F.setCallingConv(CallingConv::CXX_FAST_TLS);
+
+    // Privatize the early exit path
+    ValueToValueMapTy VMap;
+    BasicBlock *PrivExitBlock =
+        CloneBasicBlock(ExitBlock, VMap, ".private", &F);
+
+    // CloneBasicBlock does not remap operands, so the cloned instructions still
+    // use values defined in ExitBlock. Point those uses at the cloned values.
+    for (Instruction &I : *PrivExitBlock)
+      for (Use &U : I.operands()) {
+        auto Mapped = VMap.find(U.get());
+        if (Mapped != VMap.end())
+          U.set(Mapped->second);
+      }
+
+    // The private block is reached only from Entry, so drop every other
+    // incoming value from its PHIs. Walk the indices from the top so that a
+    // removal never moves an entry that is still to be visited.
+    for (auto I = PrivExitBlock->begin();
+         &*I != PrivExitBlock->getFirstNonPHI(); ++I) {
+      PHINode *Phi = cast<PHINode>(&*I);
+      int EntryIndex = Phi->getBasicBlockIndex(&Entry);
+      for (int Index = Phi->getNumIncomingValues() - 1; Index >= 0; --Index)
+        if (Index != EntryIndex)
+          Phi->removeIncomingValue(Index);
+    }
+
+    // Entry will no longer branch to ExitBlock. Update ExitBlock's PHIs while
+    // Entry is still a predecessor, i.e. before the branch is retargeted.
+    ExitBlock->removePredecessor(&Entry);
+
+    for (unsigned i = 0; i < EntryTerm->getNumSuccessors(); ++i)
+      if (EntryTerm->getSuccessor(i) == ExitBlock)
+        EntryTerm->setSuccessor(i, PrivExitBlock);
+
+    return true;
+  }
+};
+
+char PPCEnableShrinkWrap::ID = 0;
+INITIALIZE_PASS(PPCEnableShrinkWrap, "create-wrapper-opps",
+                "Find opportunities for wrapping", false, false)
+
+ModulePass *llvm::createPPCEnableShrinkWrapPass() {
+  return new PPCEnableShrinkWrap();
+}
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -74,6 +74,7 @@

   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializePPCBoolRetToIntPass(PR);
+  initializePPCEnableShrinkWrapPass(PR);
 }

 /// Return the datalayout string of a subtarget.
@@ -304,6 +305,8 @@
 void PPCPassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createPPCBoolRetToIntPass());
+  if (TM->getOptLevel() > CodeGenOpt::Default)
+    addPass(createPPCEnableShrinkWrapPass());
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));

   // For the BG/Q (or if explicitly requested), add explicit data prefetch
Index: test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
===================================================================
--- test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
+++ test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
@@ -0,0 +1,100 @@
+; RUN: opt -create-wrapper-opps -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-LABEL: cxx_fast_tlscc signext i32 @testEnableShrinkWrap
+define signext i32 @testEnableShrinkWrap(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+; CHECK: br i1 %tobool, label %if.end.private, label %if.then
+  br i1 %tobool, label %if.end, label %if.then
+
+; CHECK-LABEL: if.then:
+if.then:                                          ; preds = %entry
+  tail call void @doSomething()
+; CHECK: br label %if.end
+  br label %if.end
+; CHECK-LABEL: if.end.private:
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: doSomething
+declare void @doSomething()
+
+; CHECK-LABEL: define signext i32 @testEnableShrinkWrapVariadic
+define signext i32 @testEnableShrinkWrapVariadic(i32 signext %i, ...) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: define signext i32 @callVariadic
+define signext i32 @callVariadic(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void (i32, ...) @doSomethingVariadic(i32 signext %i, i32 signext 7)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: declare void @doSomethingVariadic
+declare void @doSomethingVariadic(i32 signext, ...)
+
+; CHECK-LABEL: define signext i32 @tooComplicatedControlFlow
+define signext i32 @tooComplicatedControlFlow(i32 zeroext %i, i32 zeroext %j) {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %dec, %do.body ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec = add i32 %i.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.body.1.preheader, label %do.body
+
+do.body.1.preheader:                              ; preds = %do.body
+  br label %do.body.1
+
+do.body.1:                                        ; preds = %do.body.1.preheader, %do.body.1
+  %j.addr.0 = phi i32 [ %dec3, %do.body.1 ], [ %j, %do.body.1.preheader ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec3 = add i32 %j.addr.0, -1
+  %tobool4 = icmp eq i32 %dec3, 0
+  br i1 %tobool4, label %do.end.5, label %do.body.1
+
+do.end.5:                                         ; preds = %do.body.1
+  ret i32 7
+}
+
+; CHECK-LABEL: define signext i32 @notSideEffectFree
+define signext i32 @notSideEffectFree(i32* nocapture readonly %i) {
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %i, i64 1
+  %0 = load i32, i32* %i, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %1 = load i32, i32* %incdec.ptr, align 4
+  ret i32 %1
+}