Index: lib/CodeGen/TargetFrameLoweringImpl.cpp
===================================================================
--- lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -63,12 +63,12 @@
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
 
+  SavedRegs.resize(TRI.getNumRegs());
+
   // Early exit if there are no callee saved registers.
   if (!CSRegs || CSRegs[0] == 0)
     return;
 
-  SavedRegs.resize(TRI.getNumRegs());
-
   // In Naked functions we aren't going to save any registers.
   if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
     return;
Index: lib/Target/PowerPC/PPCFastISel.cpp
===================================================================
--- lib/Target/PowerPC/PPCFastISel.cpp
+++ lib/Target/PowerPC/PPCFastISel.cpp
@@ -196,7 +196,7 @@
 
 #include "PPCGenCallingConv.inc"
 
-// Function whose sole purpose is to kill compiler warnings 
+// Function whose sole purpose is to kill compiler warnings
 // stemming from unused functions included from PPCGenCallingConv.inc.
 CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
   if (Flag == 1)
@@ -442,7 +442,7 @@
 
 // Emit a load instruction if possible, returning true if we succeeded,
 // otherwise false. See commentary below for how the register class of
-// the load is determined. 
+// the load is determined.
 bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
                               const TargetRegisterClass *RC, bool IsZExt,
                               unsigned FP64LoadOpc) {
@@ -474,11 +474,11 @@
     break;
   case MVT::i16:
     Opc = (IsZExt ?
-            (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : 
+            (Is32BitInt ? PPC::LHZ : PPC::LHZ8) :
             (Is32BitInt ? PPC::LHA : PPC::LHA8));
     break;
   case MVT::i32:
-    Opc = (IsZExt ? 
+    Opc = (IsZExt ?
            (Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
            (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
     if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
@@ -486,7 +486,7 @@
     break;
   case MVT::i64:
     Opc = PPC::LD;
-    assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && 
+    assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
           "64-bit load with 32-bit target??");
     UseOffset = ((Addr.Offset & 3) == 0);
     break;
@@ -674,7 +674,7 @@
   } else if (UseOffset) {
     // VSX only provides an indexed store.
     if (Is32VSXStore || Is64VSXStore) return false;
-    
+
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
       .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
@@ -812,7 +812,7 @@
   long Imm = 0;
   bool UseImm = false;
 
-  // Only 16-bit integer constants can be represented in compares for 
+  // Only 16-bit integer constants can be represented in compares for
   // PowerPC. Others will be materialized into a register.
   if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
     if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
@@ -1583,6 +1583,9 @@
   if (!FuncInfo.CanLowerReturn)
     return false;
 
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
   const ReturnInst *Ret = cast<ReturnInst>(I);
   const Function &F = *I->getParent()->getParent();
@@ -1599,7 +1602,7 @@
     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context);
     CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
     const Value *RV = Ret->getOperand(0);
-    
+
     // FIXME: Only one output register for now.
     if (ValLocs.size() > 1)
       return false;
@@ -1645,7 +1648,7 @@
       if (RVVT != DestVT && RVVT != MVT::i8 && RVVT != MVT::i16 &&
          RVVT != MVT::i32)
        return false;
-      
+
      if (RVVT != DestVT) {
        switch (VA.getLocInfo()) {
          default:
@@ -1919,7 +1922,7 @@
             TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
       .addImm(0).addReg(TmpReg2);
-  } else 
+  } else
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
       .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
       .addReg(TmpReg)
@@ -2011,7 +2014,7 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
       .addImm(Hi);
-    
+
     return ResultReg;
   }
@@ -2245,7 +2248,7 @@
 // Handle materializing integer constants into a register. This is not
 // automatically generated for PowerPC, so must be explicitly created here.
 unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
-  
+
   if (Opc != ISD::Constant)
     return 0;
@@ -2259,7 +2262,7 @@
   }
 
   if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
-      VT != MVT::i8 && VT != MVT::i1) 
+      VT != MVT::i8 && VT != MVT::i1)
     return 0;
 
   const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
Index: lib/Target/PowerPC/PPCFrameLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCFrameLowering.cpp
+++ lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -584,13 +584,13 @@
     if (MBBI != MBB->begin())
       RS.forward(MBBI);
   }
-  
-  if (!RS.isRegUsed(R0)) 
+
+  if (!RS.isRegUsed(R0))
     return true;
 
   unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? &PPC::G8RCRegClass
                                                       : &PPC::GPRCRegClass);
-  
+
   // Make sure the register scavenger was able to find an available register
   // If not, use R0 but return false to indicate no register was available and
   // R0 must be used (as recommended by the ABI)
@@ -703,7 +703,7 @@
 
   findScratchRegister(&MBB, false, &ScratchReg);
   assert(ScratchReg && "No scratch register!");
-  
+
   int LROffset = getReturnSaveOffset();
 
   int FPOffset = 0;
@@ -984,7 +984,7 @@
 
   if (MBBI != MBB.end())
     dl = MBBI->getDebugLoc();
-  
+
   const PPCInstrInfo &TII =
       *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
   const PPCRegisterInfo *RegInfo =
@@ -1026,14 +1026,14 @@
                                                  : PPC::ADDI );
   const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
                                                 : PPC::ADD4 );
-  
+
   int LROffset = getReturnSaveOffset();
 
   int FPOffset = 0;
 
   findScratchRegister(&MBB, true, &ScratchReg);
   assert(ScratchReg && "No scratch register!");
-  
+
   if (HasFP) {
     if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
@@ -1066,7 +1066,7 @@
   }
 
   bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn());
-  
+
   if (IsReturnBlock) {
     unsigned RetOpcode = MBBI->getOpcode();
     bool UsesTCRet = RetOpcode == PPC::TCRETURNri ||
Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -391,7 +391,7 @@
     /// a VMRGEW or VMRGOW instruction
     bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
                              unsigned ShuffleKind, SelectionDAG &DAG);
-    
+
     /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
     /// shift amount, otherwise return -1.
     int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
@@ -442,6 +442,18 @@
       return true;
     }
 
+    bool supportSplitCSR(MachineFunction *MF) const override {
+      return
+        MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+        MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+    }
+
+    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+
+    void insertCopiesSplitCSR(
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -5814,6 +5814,25 @@
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *I =
+    TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+
+      if (PPC::G8RCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else if (PPC::F8RCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+      else if (PPC::CRRCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i1));
+      else if (PPC::VRRCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::Other));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
   RetOps[0] = Chain;  // Update chain.
 
   // Add the flag if we have it.
@@ -11588,3 +11607,57 @@
                                   const TargetLibraryInfo *LibInfo) const {
   return PPC::createFastISel(FuncInfo, LibInfo);
 }
+
+void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  if (Subtarget.isDarwinABI()) return;
+  if (!Subtarget.isPPC64()) return;
+
+  // Update IsSplitCSR in PPCFunctionInfo
+  PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
+  PFI->setIsSplitCSR(true);
+}
+
+void PPCTargetLowering::insertCopiesSplitCSR(
+  MachineBasicBlock *Entry,
+  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (PPC::G8RCRegClass.contains(*I))
+      RC = &PPC::G8RCRegClass;
+    else if (PPC::F8RCRegClass.contains(*I))
+      RC = &PPC::F8RCRegClass;
+    else if (PPC::CRRCRegClass.contains(*I))
+      RC = &PPC::CRRCRegClass;
+    else if (PPC::VRRCRegClass.contains(*I))
+      RC = &PPC::VRRCRegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+             Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+      .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+        .addReg(NewVR);
+  }
+}
Index: lib/Target/PowerPC/PPCMachineFunctionInfo.h
===================================================================
--- lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -27,7 +27,7 @@
   /// stored. Also used as an anchor for instructions that need to be altered
   /// when using frame pointers (dyna_add, dyna_sub.)
   int FramePointerSaveIndex;
-  
+
   /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
   ///
   int ReturnAddrSaveIndex;
@@ -104,8 +104,12 @@
   /// Whether this uses the PIC Base register or not.
   bool UsesPICBase;
 
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies
+  bool IsSplitCSR;
+
 public:
-  explicit PPCFunctionInfo(MachineFunction &MF) 
+  explicit PPCFunctionInfo(MachineFunction &MF)
     : FramePointerSaveIndex(0),
       ReturnAddrSaveIndex(0),
       BasePointerSaveIndex(0),
@@ -125,11 +129,12 @@
       VarArgsNumFPR(0),
       CRSpillFrameIndex(0),
       MF(MF),
-      UsesPICBase(0) {}
+      UsesPICBase(0),
+      IsSplitCSR(false) {}
 
   int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
   void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
-  
+
   int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
   void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
@@ -196,6 +201,9 @@
   void setUsesPICBase(bool uses) { UsesPICBase = uses; }
   bool usesPICBase() const { return UsesPICBase; }
 
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
   MCSymbol *getPICOffsetSymbol() const;
 
   MCSymbol *getGlobalEPSymbol() const;
Index: lib/Target/PowerPC/PPCRegisterInfo.h
===================================================================
--- lib/Target/PowerPC/PPCRegisterInfo.h
+++ lib/Target/PowerPC/PPCRegisterInfo.h
@@ -75,6 +75,7 @@
   /// Code Generation virtual methods...
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID CC) const override;
   const uint32_t *getNoPreservedMask() const override;
Index: lib/Target/PowerPC/PPCRegisterInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -116,9 +116,11 @@
                           : (Subtarget.hasAltivec() ?
                              CSR_Darwin32_Altivec_SaveList :
                              CSR_Darwin32_SaveList);
 
+  if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+    return CSR_SRV464_TLS_PE_SaveList;
+
   // On PPC64, we might need to save r2 (but only if it is not reserved).
   bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
-
   return TM.isPPC64() ? (Subtarget.hasAltivec() ? (SaveR2 ?
                                                    CSR_SVR464_R2_Altivec_SaveList
@@ -128,6 +130,30 @@
                                   : CSR_SVR432_SaveList);
 }
 
+const MCPhysReg *
+PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+  if (Subtarget.isDarwinABI())
+    return nullptr;
+
+  // On PPC64, we might need to save r2 (but only if it is not reserved).
+  if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+      MF->getInfo<PPCFunctionInfo>()->isSplitCSR()) {
+    bool SaveR2 = !getReservedRegs(*MF).test(PPC::X2);
+    return TM.isPPC64()
+               ? (Subtarget.hasAltivec()
+                      ? (SaveR2
+                             ? CSR_SVR464_R2_Altivec_ViaCopy_SaveList
+                             : CSR_SVR464_Altivec_ViaCopy_SaveList)
+                      : (SaveR2
+                             ? CSR_SVR464_R2_ViaCopy_SaveList
+                             : CSR_SVR464_ViaCopy_SaveList))
+               : nullptr;
+  }
+  return nullptr;
+}
+
 const uint32_t *
 PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                       CallingConv::ID CC) const {
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -74,6 +74,7 @@
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializePPCBoolRetToIntPass(PR);
+  initializePPCEnableShrinkWrapPass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
@@ -304,6 +305,8 @@
 void PPCPassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createPPCBoolRetToIntPass());
+  if (TM->getOptLevel() > CodeGenOpt::Default)
+    addPass(createPPCEnableShrinkWrapPass());
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));
 
   // For the BG/Q (or if explicitly requested), add explicit data prefetch
Index: test/CodeGen/PowerPC/cxx_tlscc64.ll
===================================================================
--- test/CodeGen/PowerPC/cxx_tlscc64.ll
+++ test/CodeGen/PowerPC/cxx_tlscc64.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
+%struct.S = type { i8 }
+
+@sg = internal thread_local global %struct.S zeroinitializer, align 1
+@__dso_handle = external global i8
+@__tls_guard = internal thread_local unnamed_addr global i1 false
+@sum1 = internal thread_local global i32 0, align 4
+
+declare void @_ZN1SC1Ev(%struct.S*)
+declare void @_ZN1SD1Ev(%struct.S*)
+declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
+
+; CHECK-LABEL: _ZTW2sg
+define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
+  %.b.i = load i1, i1* @__tls_guard, align 1
+  br i1 %.b.i, label %__tls_init.exit, label %init.i
+
+init.i:
+  store i1 true, i1* @__tls_guard, align 1
+  tail call void @_ZN1SC1Ev(%struct.S* nonnull @sg) #2
+  %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (void (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) #2
+  br label %__tls_init.exit
+
+__tls_init.exit:
+  ret %struct.S* @sg
+}
+
+; CHECK-LABEL: _ZTW4sum1
+define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
+  ret i32* @sum1
+}
+
+define cxx_fast_tlscc i32* @_ZTW4sum2() #0 {
+  ret i32* @sum1
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
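
For reference, the IR in the new cxx_tlscc64.ll test corresponds roughly to the C++ translation unit sketched below. This is illustrative only and not part of the patch: the accessor names (get_sg, get_sum1) are hypothetical, and the exact IR a front end emits may differ. It shows the kind of thread_local wrapper functions (_ZTW2sg, _ZTW4sum1) that can be given the cxx_fast_tlscc calling convention and that therefore exercise the split-CSR path added above.

// Hypothetical C++ source; compiling something like this for
// powerpc64le-unknown-linux-gnu can produce TLS wrapper functions similar to
// the ones hand-written in the test above.
struct S {
  S();   // corresponds to @_ZN1SC1Ev in the test
  ~S();  // corresponds to @_ZN1SD1Ev in the test
};

static thread_local S sg;          // dynamic init, guarded via __tls_guard in _ZTW2sg
static thread_local int sum1 = 0;  // trivial init, still accessed through _ZTW4sum1

S *get_sg() { return &sg; }        // goes through the _ZTW2sg wrapper
int *get_sum1() { return &sum1; }  // goes through the _ZTW4sum1 wrapper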