Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -541,6 +541,10 @@ /// containing this constant value for the target. bool shouldBuildLookupTablesForConstant(Constant *C) const; + /// \brief Return true if the input function, which is cold at all call + /// sites, should use the coldcc calling convention. + bool useColdCCForColdCall(Function &F) const; + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; unsigned getOperandsScalarizationOverhead(ArrayRef Args, @@ -998,6 +1002,7 @@ virtual unsigned getJumpBufSize() = 0; virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; + virtual bool useColdCCForColdCall(Function &F) = 0; virtual unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, @@ -1243,6 +1248,10 @@ bool shouldBuildLookupTablesForConstant(Constant *C) override { return Impl.shouldBuildLookupTablesForConstant(C); } + bool useColdCCForColdCall(Function &F) override { + return Impl.useColdCCForColdCall(F); + } + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) override { return Impl.getScalarizationOverhead(Ty, Insert, Extract); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -284,6 +284,8 @@ bool shouldBuildLookupTables() { return true; } bool shouldBuildLookupTablesForConstant(Constant *C) { return true; } + bool useColdCCForColdCall(Function &F) { return false; } + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { return 0; } Index: lib/Analysis/TargetTransformInfo.cpp 
=================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -231,6 +231,10 @@ return TTIImpl->shouldBuildLookupTablesForConstant(C); } +bool TargetTransformInfo::useColdCCForColdCall(Function &F) const { + return TTIImpl->useColdCCForColdCall(F); +} + unsigned TargetTransformInfo:: getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const { return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract); Index: lib/Target/PowerPC/PPCCallingConv.td =================================================================== --- lib/Target/PowerPC/PPCCallingConv.td +++ lib/Target/PowerPC/PPCCallingConv.td @@ -45,6 +45,29 @@ CCCustom<"CC_PPC_AnyReg_Error"> ]>; +// Return-value convention for PowerPC coldcc. +def RetCC_PPC_Cold : CallingConv<[ + // Use the same return registers as RetCC_PPC, but limited to only + // one return value. The remaining return values will be saved to + // the stack. + CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType>>, + CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType>>, + + CCIfType<[i32], CCAssignToReg<[R3]>>, + CCIfType<[i64], CCAssignToReg<[X3]>>, + CCIfType<[i128], CCAssignToReg<[X3]>>, + + CCIfType<[f32], CCAssignToReg<[F1]>>, + CCIfType<[f64], CCAssignToReg<[F1]>>, + + CCIfType<[v4f64, v4f32, v4i1], + CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, + + CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], + CCIfSubtarget<"hasAltivec()", + CCAssignToReg<[V2]>>> +]>; + // Return-value convention for PowerPC def RetCC_PPC : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, @@ -271,6 +294,36 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>; +// coldcc calling convention marks most registers as non-volatile. +// Do not include r1 since the stack pointer is never considered a CSR. 
+// Do not include r2, since it is the TOC register and is added depending +// on whether or not the function uses the TOC and is a non-leaf. +// Do not include r0,r11,r13 as they are optional in functional linkage +// and their values may be altered by inter-library calls. +// Do not include r12 as it is used as a scratch register. +// Do not include return registers r3, f1, v2. +def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10), + (sequence "R%u", 14, 31), + F0, (sequence "F%u", 2, 31), + (sequence "CR%u", 0, 7))>; + +def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC, + (sequence "V%u", 0, 1), + (sequence "V%u", 3, 31))>; + +def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10), + (sequence "X%u", 14, 31), + F0, (sequence "F%u", 2, 31), + (sequence "CR%u", 0, 7))>; + +def CSR_SVR64_ColdCC_R2: CalleeSavedRegs<(add CSR_SVR64_ColdCC, X2)>; + +def CSR_SVR64_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC, + (sequence "V%u", 0, 1), + (sequence "V%u", 3, 31))>; + +def CSR_SVR64_ColdCC_R2_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC_Altivec, X2)>; + def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10), (sequence "X%u", 14, 31), (sequence "F%u", 0, 31), Index: lib/Target/PowerPC/PPCFastISel.cpp =================================================================== --- lib/Target/PowerPC/PPCFastISel.cpp +++ lib/Target/PowerPC/PPCFastISel.cpp @@ -206,6 +206,8 @@ return CC_PPC32_SVR4_ByVal; else if (Flag == 3) return CC_PPC32_SVR4_VarArg; + else if (Flag == 4) + return RetCC_PPC_Cold; else return RetCC_PPC; } Index: lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCFrameLowering.cpp +++ lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1950,7 +1950,14 @@ bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; // Add the callee-saved register as live-in; it's killed at the spill. 
- MBB.addLiveIn(Reg); + // Do not do this for callee-saved registers that are live-in to the + // function because they will already be marked live-in and this will be + // adding it for a second time. It is an error to add the same register + // to the set more than once. + const MachineRegisterInfo &MRI = MF->getRegInfo(); + bool IsLiveIn = MRI.isLiveIn(Reg); + if (!IsLiveIn) + MBB.addLiveIn(Reg); if (CRSpilled && IsCRField) { CRMIB.addReg(Reg, RegState::ImplicitKill); @@ -1980,7 +1987,10 @@ } } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, + // Use !IsLiveIn for the kill flag. + // We do not want to kill registers that are live in this function + // before their use because they will become undefined registers. + TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(), RC, TRI); } } Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -4916,7 +4916,11 @@ SmallVector RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); + + CCRetInfo.AnalyzeCallResult( + Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) + ? RetCC_PPC_Cold + : RetCC_PPC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { @@ -5136,6 +5140,7 @@ // of the 32-bit SVR4 ABI stack frame layout. 
assert((CallConv == CallingConv::C || + CallConv == CallingConv::Cold || CallConv == CallingConv::Fast) && "Unknown calling convention!"); unsigned PtrByteSize = 4; @@ -6397,7 +6402,10 @@ LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC_PPC); + return CCInfo.CheckReturn( + Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) + ? RetCC_PPC_Cold + : RetCC_PPC); } SDValue @@ -6409,7 +6417,10 @@ SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC_PPC); + CCInfo.AnalyzeReturn(Outs, + (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) + ? RetCC_PPC_Cold + : RetCC_PPC); SDValue Flag; SmallVector RetOps(1, Chain); Index: lib/Target/PowerPC/PPCRegisterInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCRegisterInfo.cpp +++ lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -144,6 +144,17 @@ // On PPC64, we might need to save r2 (but only if it is not reserved). bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2); + if (MF->getFunction()->getCallingConv() == CallingConv::Cold) { + return TM.isPPC64() + ? (Subtarget.hasAltivec() + ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList + : CSR_SVR64_ColdCC_Altivec_SaveList) + : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList + : CSR_SVR64_ColdCC_SaveList)) + : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList + : CSR_SVR32_ColdCC_SaveList); + } + return TM.isPPC64() ? (Subtarget.hasAltivec() ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList @@ -196,6 +207,13 @@ : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask : CSR_Darwin32_RegMask); + if (CC == CallingConv::Cold) { + return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask + : CSR_SVR64_ColdCC_RegMask) + : (Subtarget.hasAltivec() ? 
CSR_SVR32_ColdCC_Altivec_RegMask + : CSR_SVR32_ColdCC_RegMask); + } + return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask : CSR_SVR464_RegMask) : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -61,7 +61,7 @@ /// \name Vector TTI Implementations /// @{ - + bool useColdCCForColdCall(Function &F); bool enableAggressiveInterleaving(bool LoopHasReductions); const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( bool IsZeroCmp) const; Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -27,6 +27,11 @@ CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); +static cl::opt +EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false), + cl::desc("Enable using coldcc calling conv for cold " + "internal functions")); + //===----------------------------------------------------------------------===// // // PPC cost model. @@ -215,6 +220,14 @@ BaseT::getUnrollingPreferences(L, SE, UP); } +// This function returns true to allow using coldcc calling convention. +// Returning true results in coldcc being used for functions which are cold at +// all call sites when the callers of the functions are not calling any other +// non coldcc functions. +bool PPCTTIImpl::useColdCCForColdCall(Function &F) { + return EnablePPCColdCC; +} + bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { // On the A2, always unroll aggressively. 
For QPX unaligned loads, we depend // on combining the loads generated for consecutive accesses, and failure to Index: lib/Transforms/IPO/GlobalOpt.cpp =================================================================== --- lib/Transforms/IPO/GlobalOpt.cpp +++ lib/Transforms/IPO/GlobalOpt.cpp @@ -22,9 +22,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -55,6 +57,7 @@ #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -88,6 +91,21 @@ STATISTIC(NumAliasesResolved, "Number of global aliases resolved"); STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); +STATISTIC(NumInternalFunc, "Number of internal functions"); +STATISTIC(NumColdCC, "Number of functions marked coldcc"); + +static cl::opt + EnableColdCCStressTest("enable-coldcc-stress-test", + cl::desc("Enable stress test of coldcc by adding " + "calling conv to all internal functions."), + cl::init(false), cl::Hidden); + +static cl::opt ColdCCRelFreq( + "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, + cl::desc( + "Maximum block frequency, expressed as a percentage of caller's " + "entry frequency, for a call site to be considered cold for enabling " + "coldcc")); /// Is this global variable possibly used by a leak checker as a root? If so, /// we might not really want to eliminate the stores to it. 
@@ -2097,20 +2115,114 @@ /// idea here is that we don't want to mess with the convention if the user /// explicitly requested something with performance implications like coldcc, /// GHC, or anyregcc. -static bool isProfitableToMakeFastCC(Function *F) { +static bool hasChangeableCC(Function *F) { CallingConv::ID CC = F->getCallingConv(); // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc? return CC == CallingConv::C || CC == CallingConv::X86_ThisCall; } +/// Return true if the block containing the call site has a BlockFrequency of +/// less than ColdCCRelFreq% of the entry block. +static bool isColdCallSite(CallSite CS, BlockFrequencyInfo &CallerBFI) { + const BranchProbability ColdProb(ColdCCRelFreq, 100); + auto CallSiteBB = CS.getInstruction()->getParent(); + auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB); + auto CallerEntryFreq = + CallerBFI.getBlockFreq(&(CS.getCaller()->getEntryBlock())); + return CallSiteFreq < CallerEntryFreq * ColdProb; +} + +// This function checks if the input function F is cold at all call sites. It +// also looks at each call site's containing function, returning false if the +// caller function contains other non cold calls. The input vector AllCallsCold +// contains a list of functions that only have call sites in cold blocks. 
+static bool +isValidCandidateForColdCC(Function &F, + function_ref GetBFI, + const std::vector &AllCallsCold) { + + if (F.user_empty()) + return false; + + for (User *U : F.users()) { + if (isa(U)) + continue; + + CallSite CS(cast(U)); + Function *CallerFunc = CS.getInstruction()->getParent()->getParent(); + BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); + if (!isColdCallSite(CS, CallerBFI)) + return false; + auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc); + if (It == AllCallsCold.end()) + return false; + } + return true; +} + +static void changeCallSitesToColdCC(Function *F) { + for (User *U : F->users()) { + if (isa(U)) + continue; + CallSite CS(cast(U)); + CS.setCallingConv(CallingConv::Cold); + } +} + +// This function iterates over all the call instructions in the input Function +// and checks that all call sites are in cold blocks and are allowed to use the +// coldcc calling convention. +static bool +hasOnlyColdCalls(Function &F, + function_ref GetBFI) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (CallInst *CI = dyn_cast(&I)) { + CallSite CS(cast(CI)); + // Skip over inline asm instructions since they aren't function calls. + if (CI->isInlineAsm()) + continue; + Function *CalledFn = CI->getCalledFunction(); + if (!CalledFn) + return false; + if (!CalledFn->hasLocalLinkage()) + return false; + // Skip over intrinsics since they won't remain as function calls. + if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic) + continue; + // Check if it's valid to use coldcc calling convention. 
+ if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() || + CalledFn->hasAddressTaken()) + return false; + BlockFrequencyInfo &CallerBFI = GetBFI(F); + if (!isColdCallSite(CS, CallerBFI)) + return false; + } + } + } + return true; +} + static bool OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, + function_ref GetTTI, + function_ref GetBFI, function_ref LookupDomTree, SmallSet &NotDiscardableComdats) { + bool Changed = false; + + std::vector AllCallsCold; + for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) { + Function *F = &*FI++; + if (hasOnlyColdCalls(*F, GetBFI)) + AllCallsCold.push_back(F); + } + // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { Function *F = &*FI++; + // Functions without names cannot be referenced outside this module. if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); @@ -2142,7 +2254,25 @@ if (!F->hasLocalLinkage()) continue; - if (isProfitableToMakeFastCC(F) && !F->isVarArg() && + + if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { + NumInternalFunc++; + TargetTransformInfo &TTI = GetTTI(*F); + // Change the calling convention to coldcc if either stress testing is + // enabled or the target would like to use coldcc on functions which are + // cold at all call sites and the callers contain no other non coldcc + // calls. 
+ if (EnableColdCCStressTest || + (isValidCandidateForColdCC(*F, GetBFI, AllCallsCold) && + TTI.useColdCCForColdCall(*F))) { + F->setCallingConv(CallingConv::Cold); + changeCallSitesToColdCC(F); + Changed = true; + NumColdCC++; + } + } + + if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { // If this function has a calling convention worth changing, is not a // varargs function, and is only called directly, promote it to use the @@ -2619,6 +2749,8 @@ static bool optimizeGlobalsInModule( Module &M, const DataLayout &DL, TargetLibraryInfo *TLI, + function_ref GetTTI, + function_ref GetBFI, function_ref LookupDomTree) { SmallSet NotDiscardableComdats; bool Changed = false; @@ -2641,8 +2773,8 @@ NotDiscardableComdats.insert(C); // Delete functions that are trivially dead, ccc -> fastcc - LocalChange |= - OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats); + LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree, + NotDiscardableComdats); // Optimize global_ctors list. 
LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { @@ -2679,7 +2811,15 @@ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{ return FAM.getResult(F); }; - if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree)) + auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + + auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; + + if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2702,11 +2842,21 @@ auto LookupDomTree = [this](Function &F) -> DominatorTree & { return this->getAnalysis(F).getDomTree(); }; - return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree); + auto GetTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis().getTTI(F); + }; + + auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & { + return this->getAnalysis(F).getBFI(); + }; + + return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); } }; @@ -2718,6 +2868,8 @@ INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt", "Global Variable Optimizer", false, false) Index: test/CodeGen/PowerPC/coldcc.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/coldcc.ll @@ -0,0 +1,46 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=COLDCC + +define signext i32 @caller(i32 signext %a, i32 signext %b, i32 
signext %cold) { +entry: + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"(i32 %a, i32 %b) + %mul = mul nsw i32 %0, %cold + %tobool = icmp eq i32 %cold, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul1 = mul nsw i32 %mul, %cold + %mul2 = mul nsw i32 %b, %a + %call = tail call coldcc signext i32 @callee(i32 signext %a, i32 signext %b) + %add = add i32 %mul2, %a + %add3 = add i32 %add, %mul + %add4 = add i32 %add3, %mul1 + %add5 = add i32 %add4, %call + br label %if.end + +if.end: ; preds = %entry, %if.then + %f.0 = phi i32 [ %add5, %if.then ], [ %0, %entry ] + ret i32 %f.0 +} + +define internal coldcc signext i32 @callee(i32 signext %a, i32 signext %b) local_unnamed_addr #0 { +entry: +; COLDCC: @callee +; COLDCC: std 6, -8(1) +; COLDCC: std 7, -16(1) +; COLDCC: std 8, -24(1) +; COLDCC: std 9, -32(1) +; COLDCC: std 10, -40(1) +; COLDCC: ld 9, -32(1) +; COLDCC: ld 8, -24(1) +; COLDCC: ld 7, -16(1) +; COLDCC: ld 10, -40(1) +; COLDCC: ld 6, -8(1) + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9},~{r10}"(i32 %a, i32 %b) + %mul = mul nsw i32 %a, 3 + %1 = mul i32 %b, -5 + %add = add i32 %1, %mul + %sub = add i32 %add, %0 + ret i32 %sub +} + +attributes #0 = { noinline } Index: test/CodeGen/PowerPC/coldcc2.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/coldcc2.ll @@ -0,0 +1,42 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=COLDCC + +%struct.MyStruct = type { i32, i32, i32, i32 } + +@caller.s = internal unnamed_addr global %struct.MyStruct zeroinitializer, align 8 + +define signext i32 @caller(i32 signext %a, i32 signext %b, i32 signext %cold) { +entry: +; COLDCC: bl callee +; COLDCC: ld 4, 40(1) +; COLDCC: ld 5, 32(1) + %call = tail call coldcc { i64, i64 } 
@callee(i32 signext %a, i32 signext %b) + %0 = extractvalue { i64, i64 } %call, 0 + %1 = extractvalue { i64, i64 } %call, 1 + store i64 %0, i64* bitcast (%struct.MyStruct* @caller.s to i64*), align 8 + store i64 %1, i64* bitcast (i32* getelementptr inbounds (%struct.MyStruct, %struct.MyStruct* @caller.s, i64 0, i32 2) to i64*), align 8 + %2 = lshr i64 %1, 32 + %3 = trunc i64 %2 to i32 + %sub = sub nsw i32 0, %3 + ret i32 %sub +} + +define internal coldcc { i64, i64 } @callee(i32 signext %a, i32 signext %b) { +entry: +; COLDCC: std {{[0-9]+}}, 0(3) +; COLDCC: std {{[0-9]+}}, 8(3) + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9},~{r10}"(i32 %a, i32 %b) + %mul = mul nsw i32 %a, 3 + %1 = mul i32 %b, -5 + %add = add i32 %1, %mul + %sub = add i32 %add, %0 + %mul5 = mul nsw i32 %b, %a + %add6 = add nsw i32 %sub, %mul5 + %retval.sroa.0.0.insert.ext = zext i32 %0 to i64 + %retval.sroa.3.8.insert.ext = zext i32 %sub to i64 + %retval.sroa.3.12.insert.ext = zext i32 %add6 to i64 + %retval.sroa.3.12.insert.shift = shl nuw i64 %retval.sroa.3.12.insert.ext, 32 + %retval.sroa.3.12.insert.insert = or i64 %retval.sroa.3.12.insert.shift, %retval.sroa.3.8.insert.ext + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.insert.ext, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.3.12.insert.insert, 1 + ret { i64, i64 } %.fca.1.insert +} Index: test/Other/pass-pipelines.ll =================================================================== --- test/Other/pass-pipelines.ll +++ test/Other/pass-pipelines.ll @@ -93,7 +93,7 @@ ; FIXME: There really shouldn't be another pass manager, especially one that ; just builds the domtree. It doesn't even run the verifier. 
; CHECK-O2: Pass Arguments: -; CHECK-O2-NEXT: FunctionPass Manager +; CHECK-O2: FunctionPass Manager ; CHECK-O2-NEXT: Dominator Tree Construction define void @foo() { Index: test/Transforms/GlobalOpt/PowerPC/coldcc_coldsites.ll =================================================================== --- /dev/null +++ test/Transforms/GlobalOpt/PowerPC/coldcc_coldsites.ll @@ -0,0 +1,81 @@ +; RUN: opt -globalopt -mtriple=powerpc64le-unknown-linux-gnu -ppc-enable-coldcc -S < %s | FileCheck %s -check-prefix=COLDCC +; RUN: opt -globalopt -S < %s | FileCheck %s -check-prefix=CHECK + +define signext i32 @caller(i32 signext %a, i32 signext %b, i32 signext %lim, i32 signext %i) local_unnamed_addr #0 !prof !30 { +entry: +; COLDCC: call coldcc signext i32 @callee +; CHECK: call fastcc signext i32 @callee + %add = add nsw i32 %b, %a + %sub = add nsw i32 %lim, -1 + %cmp = icmp eq i32 %sub, %i + br i1 %cmp, label %if.then, label %if.end, !prof !31 + +if.then: ; preds = %entry + %call = tail call signext i32 @callee(i32 signext %a, i32 signext %b) + br label %if.end + +if.end: ; preds = %if.then, %entry + %f.0 = phi i32 [ %call, %if.then ], [ %add, %entry ] + ret i32 %f.0 +} + +define internal signext i32 @callee(i32 signext %a, i32 signext %b) unnamed_addr #0 { +entry: + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9}"(i32 %a, i32 %b) #1, !srcloc !32 + %mul = mul nsw i32 %a, 3 + %mul1 = shl i32 %0, 1 + %add = add nsw i32 %mul1, %mul + ret i32 %add +} + +define signext i32 @main() local_unnamed_addr #0 !prof !33 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +for.body: ; preds = %for.body, %entry + %i.011 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %ret.010 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %call = tail call signext i32 @caller(i32 signext 4, i32 signext 5, i32 signext 10000000, i32 signext %i.011) + %add = add nsw i32 %call, %ret.010 + %inc = add nuw nsw 
i32 %i.011, 1 + %exitcond = icmp eq i32 %inc, 10000000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !34 +} +attributes #0 = { noinline } + +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 20000003} +!4 = !{!"MaxCount", i64 10000000} +!5 = !{!"MaxInternalCount", i64 10000000} +!6 = !{!"MaxFunctionCount", i64 10000000} +!7 = !{!"NumCounts", i64 5} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13, !14, !15, !16, !16, !17, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26} +!11 = !{i32 10000, i64 10000000, i32 2} +!12 = !{i32 100000, i64 10000000, i32 2} +!13 = !{i32 200000, i64 10000000, i32 2} +!14 = !{i32 300000, i64 10000000, i32 2} +!15 = !{i32 400000, i64 10000000, i32 2} +!16 = !{i32 500000, i64 10000000, i32 2} +!17 = !{i32 600000, i64 10000000, i32 2} +!18 = !{i32 700000, i64 10000000, i32 2} +!19 = !{i32 800000, i64 10000000, i32 2} +!20 = !{i32 900000, i64 10000000, i32 2} +!21 = !{i32 950000, i64 10000000, i32 2} +!22 = !{i32 990000, i64 10000000, i32 2} +!23 = !{i32 999000, i64 10000000, i32 2} +!24 = !{i32 999900, i64 10000000, i32 2} +!25 = !{i32 999990, i64 10000000, i32 2} +!26 = !{i32 999999, i64 10000000, i32 2} +!30 = !{!"function_entry_count", i64 10000000} +!31 = !{!"branch_weights", i32 2, i32 10000000} +!32 = !{i32 59} +!33 = !{!"function_entry_count", i64 1} +!34 = !{!"branch_weights", i32 2, i32 10000001} Index: test/Transforms/GlobalOpt/PowerPC/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/GlobalOpt/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + Index: test/Transforms/GlobalOpt/coldcc_stress_test.ll =================================================================== --- /dev/null +++ test/Transforms/GlobalOpt/coldcc_stress_test.ll @@ -0,0 +1,48 @@ +; RUN: opt 
< %s -globalopt -S -enable-coldcc-stress-test -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=COLDCC +; RUN: opt < %s -globalopt -S | FileCheck %s -check-prefix=CHECK + +define internal i32 @callee_default(i32* %m) { +; COLDCC-LABEL: define internal coldcc i32 @callee_default +; CHECK-LABEL: define internal fastcc i32 @callee_default + %v = load i32, i32* %m + ret i32 %v +} + +define internal fastcc i32 @callee_fastcc(i32* %m) { +; COLDCC-LABEL: define internal fastcc i32 @callee_fastcc +; CHECK-LABEL: define internal fastcc i32 @callee_fastcc + %v = load i32, i32* %m + ret i32 %v +} + +define internal coldcc i32 @callee_coldcc(i32* %m) { +; COLDCC-LABEL: define internal coldcc i32 @callee_coldcc +; CHECK-LABEL: define internal coldcc i32 @callee_coldcc + %v = load i32, i32* %m + ret i32 %v +} + +define i32 @callee(i32* %m) { + %v = load i32, i32* %m + ret i32 %v +} + +define void @caller() { + %m = alloca i32 + call i32 @callee_default(i32* %m) + call fastcc i32 @callee_fastcc(i32* %m) + call coldcc i32 @callee_coldcc(i32* %m) + call i32 @callee(i32* %m) + ret void +} + +; COLDCC-LABEL: define void @caller() +; COLDCC: call coldcc i32 @callee_default +; COLDCC: call fastcc i32 @callee_fastcc +; COLDCC: call coldcc i32 @callee_coldcc +; COLDCC: call i32 @callee +; CHECK-LABEL: define void @caller() +; CHECK: call fastcc i32 @callee_default +; CHECK: call fastcc i32 @callee_fastcc +; CHECK: call coldcc i32 @callee_coldcc +; CHECK: call i32 @callee