Index: lib/Target/AArch64/AArch64CallingConvention.td =================================================================== --- lib/Target/AArch64/AArch64CallingConvention.td +++ lib/Target/AArch64/AArch64CallingConvention.td @@ -288,6 +288,14 @@ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), (sequence "D%u", 0, 31))>; +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF Index: lib/Target/AArch64/AArch64FastISel.cpp =================================================================== --- lib/Target/AArch64/AArch64FastISel.cpp +++ lib/Target/AArch64/AArch64FastISel.cpp @@ -3646,6 +3646,9 @@ if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector RetRegs; Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -385,6 +385,14 @@ bool isCheapToSpeculateCtlz() const override { return true; } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const override; private: bool isExtFreeImpl(const Instruction *Ext) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3271,6 +3271,19 @@ Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -10003,3 +10016,49 @@ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h =================================================================== --- lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -72,16 +72,22 @@ /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } Index: lib/Target/AArch64/AArch64RegisterInfo.h =================================================================== --- lib/Target/AArch64/AArch64RegisterInfo.h +++ lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,7 @@ /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; Index: lib/Target/AArch64/AArch64RegisterInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64RegisterInfo.cpp +++ lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -47,11 +48,22 @@ if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) - return CSR_AArch64_CXX_TLS_Darwin_SaveList; + return MF->getInfo()->isSplitCSR() ? + CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg * +AArch64RegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { Index: test/CodeGen/AArch64/cxx-tlscc.ll =================================================================== --- test/CodeGen/AArch64/cxx-tlscc.ll +++ test/CodeGen/AArch64/cxx-tlscc.ll @@ -13,7 +13,7 @@ declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) -define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind { %.b.i = load i1, i1* @__tls_guard, align 1 br i1 %.b.i, label %__tls_init.exit, label %init.i @@ -28,50 +28,49 @@ } ; CHECK-LABEL: _ZTW2sg -; CHECK-DAG: stp d31, d30 -; CHECK-DAG: stp d29, d28 -; CHECK-DAG: stp d27, d26 -; CHECK-DAG: stp d25, d24 -; CHECK-DAG: stp d23, d22 -; CHECK-DAG: stp d21, d20 -; CHECK-DAG: stp d19, d18 -; CHECK-DAG: stp d17, d16 -; CHECK-DAG: stp d7, d6 -; CHECK-DAG: stp d5, d4 -; CHECK-DAG: stp d3, d2 -; CHECK-DAG: stp d1, d0 -; CHECK-DAG: stp x20, x19 -; CHECK-DAG: stp x14, x13 -; CHECK-DAG: stp x12, x11 -; CHECK-DAG: stp x10, x9 -; CHECK-DAG: stp x8, x7 -; CHECK-DAG: stp x6, x5 -; CHECK-DAG: stp x4, x3 -; CHECK-DAG: stp x2, x1 -; CHECK-DAG: stp x29, x30 +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 ; CHECK: blr ; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] ; CHECK: blr ; CHECK: tlv_atexit ; CHECK: [[BB_end]]: ; CHECK: blr -; CHECK-DAG: ldp x2, x1 -; CHECK-DAG: ldp x4, x3 -; CHECK-DAG: ldp x6, x5 -; CHECK-DAG: ldp x8, x7 -; CHECK-DAG: ldp x10, x9 -; CHECK-DAG: ldp x12, x11 -; CHECK-DAG: ldp x14, x13 -; CHECK-DAG: ldp x20, x19 -; CHECK-DAG: ldp d1, d0 -; CHECK-DAG: ldp d3, d2 -; CHECK-DAG: ldp d5, d4 -; CHECK-DAG: ldp d7, d6 -; CHECK-DAG: ldp d17, d16 -; CHECK-DAG: ldp d19, d18 -; CHECK-DAG: ldp d21, d20 -; CHECK-DAG: ldp d23, d22 -; CHECK-DAG: ldp d25, d24 -; CHECK-DAG: ldp d27, d26 -; CHECK-DAG: ldp d29, d28 -; CHECK-DAG: ldp d31, d30 +; CHECK-NOT: ldp x2, x1 +; CHECK-NOT: ldp x4, x3 +; CHECK-NOT: ldp x6, x5 +; CHECK-NOT: ldp x8, x7 +; CHECK-NOT: ldp x10, x9 +; CHECK-NOT: ldp x12, x11 +; CHECK-NOT: ldp x14, x13 +; CHECK-NOT: ldp x20, x19 +; CHECK-NOT: ldp d1, d0 +; CHECK-NOT: ldp d3, d2 +; CHECK-NOT: ldp d5, d4 +; CHECK-NOT: ldp d7, d6 +; CHECK-NOT: ldp d17, d16 +; CHECK-NOT: ldp d19, d18 +; CHECK-NOT: ldp d21, d20 +; CHECK-NOT: ldp d23, d22 +; CHECK-NOT: ldp d25, d24 +; CHECK-NOT: ldp d27, d26 +; CHECK-NOT: ldp d29, d28 +; CHECK-NOT: ldp d31, d30