diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1571,3 +1571,23 @@ Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrNoMem]>; } +//===----------------------------------------------------------------------===// +// PowerPC Atomic Intrinsic Definitions. +let TargetPrefix = "ppc" in { + class AtomicRMW128Intrinsic + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>; + def int_ppc_atomicrmw_xchg_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_add_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_sub_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_and_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_or_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_xor_i128 : AtomicRMW128Intrinsic; + def int_ppc_atomicrmw_nand_i128 : AtomicRMW128Intrinsic; + def int_ppc_cmpxchg_i128 : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_ptr_ty, + llvm_i64_ty, llvm_i64_ty, + llvm_i64_ty, llvm_i64_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>; +} diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -689,6 +689,8 @@ if (PMV.ValueType == PMV.WordType) { PMV.AlignedAddr = Addr; PMV.AlignedAddrAlignment = AddrAlign; + PMV.ShiftAmt = ConstantInt::get(PMV.ValueType, 0); + PMV.Mask = ConstantInt::get(PMV.ValueType, ~0); return PMV; } diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -27,6 +27,7 @@ PPCCallingConv.cpp PPCCCState.cpp PPCCTRLoops.cpp + PPCExpandAtomicPseudoInsts.cpp PPCHazardRecognizers.cpp PPCInstrInfo.cpp PPCISelDAGToDAG.cpp diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -52,6 +52,7 @@ FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); FunctionPass *createPPCPreEmitPeepholePass(); + FunctionPass *createPPCExpandAtomicPseudoPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, @@ -75,6 +76,7 @@ void initializePPCPreEmitPeepholePass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); void initializePPCMIPeepholePass(PassRegistry&); + void initializePPCExpandAtomicPseudoPass(PassRegistry &); extern char &PPCVSXFMAMutateID; diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -161,6 +161,9 @@ def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics", "HasPartwordAtomics", "true", "Enable l[bh]arx and st[bh]cx.">; +def FeatureQuadwordAtomic : SubtargetFeature<"quadword-atomics", + "HasQuadwordAtomics", "true", + "Enable lqarx and stqcx.">; def FeatureInvariantFunctionDescriptors : SubtargetFeature<"invariant-function-descriptors", "HasInvariantFunctionDescriptors", "true", @@ -327,6 +330,7 @@ FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic, + FeatureQuadwordAtomic, FeaturePredictableSelectIsExpensive ]; diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp new file mode 100644 --- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -0,0 +1,306 @@ +//===-- PPCExpandAtomicPseudoInsts.cpp - Expand atomic pseudo instrs. -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands atomic pseudo instructions into +// target instructions post RA. Expanding after register allocation treats +// each LL/SC loop as a single blob, which makes it unlikely that spill code +// is inserted inside the loop. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "PPCTargetMachine.h" + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-atomic-expand" + +namespace { + +class PPCExpandAtomicPseudo : public MachineFunctionPass { +public: + const PPCInstrInfo *TII; + const PPCRegisterInfo *TRI; + static char ID; + + PPCExpandAtomicPseudo() : MachineFunctionPass(ID) { + initializePPCExpandAtomicPseudoPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + bool expandMI(MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI); + bool expandAtomicRMW128(MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI); + bool expandAtomicCmpSwap128(MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI); +}; + +static void PairedCopy(const PPCInstrInfo *TII, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + Register Dest0, Register Dest1, Register Src0, + Register Src1) { + const MCInstrDesc &OR = TII->get(PPC::OR8); + const MCInstrDesc &XOR = TII->get(PPC::XOR8); + if (Dest0 == Src1 && Dest1 == Src0) { + // The trickiest case: swapping values.
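+ // Dest0 aliases Src1 and Dest1 aliases Src0 here, so a pair of plain copies + // would clobber one source before it is read; three XORs (x ^= y; y ^= x; + // x ^= y) exchange the two registers in place without a scratch register.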
+ BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1); + BuildMI(MBB, MBBI, DL, XOR, Dest1).addReg(Dest0).addReg(Dest1); + BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1); + } else if (Dest0 != Src0 || Dest1 != Src1) { + if (Dest0 == Src1 || Dest1 != Src0) { + BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1); + BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0); + } else { + BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0); + BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1); + } + } +} + +bool PPCExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + TII = static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = &TII->getRegisterInfo(); + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock &MBB = *I; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end(); + MBBI != MBBE;) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Changed |= expandMI(MBB, MI, NMBBI); + MBBI = NMBBI; + } + } + if (Changed) + MF.RenumberBlocks(); + return Changed; +} + +bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI) { + switch (MI.getOpcode()) { + case PPC::ATOMIC_SWAP_I128: + case PPC::ATOMIC_LOAD_ADD_I128: + case PPC::ATOMIC_LOAD_SUB_I128: + case PPC::ATOMIC_LOAD_XOR_I128: + case PPC::ATOMIC_LOAD_NAND_I128: + case PPC::ATOMIC_LOAD_AND_I128: + case PPC::ATOMIC_LOAD_OR_I128: + return expandAtomicRMW128(MBB, MI, NMBBI); + case PPC::ATOMIC_CMP_SWAP_I128: + return expandAtomicCmpSwap128(MBB, MI, NMBBI); + default: + return false; + } +} + +bool PPCExpandAtomicPseudo::expandAtomicRMW128( + MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI) { + const MCInstrDesc &LL = TII->get(PPC::LQARX); + const MCInstrDesc &SC = TII->get(PPC::STQCX); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + const BasicBlock *BB = MBB.getBasicBlock(); + // Create layout of control flow. + MachineFunction::iterator MFI = ++MBB.getIterator(); + MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(MFI, LoopMBB); + MF->insert(MFI, ExitMBB); + ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()), + MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopMBB); + + // For non-min/max operations, the control flow looks like: + // MBB: + // ... + // LoopMBB: + // lqarx in, ptr + // addc out.sub_x1, in.sub_x1, op.sub_x1 + // adde out.sub_x0, in.sub_x0, op.sub_x0 + // stqcx out, ptr + // bne- LoopMBB + // ExitMBB: + // ...
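+ // Operand layout of the ATOMIC_*_I128 pseudos (see AtomicRMW128 in + // PPCInstr64Bit.td): operands 0 and 1 are the paired-GPR defs (old value + // and scratch), operands 2 and 3 are the two registers of the memrr + // address, and operands 4 and 5 are the low and high halves of the operand.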
+ Register Old = MI.getOperand(0).getReg(); + Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0); + Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1); + Register Scratch = MI.getOperand(1).getReg(); + Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0); + Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1); + Register RA = MI.getOperand(2).getReg(); + Register RB = MI.getOperand(3).getReg(); + Register IncrLo = MI.getOperand(4).getReg(); + Register IncrHi = MI.getOperand(5).getReg(); + unsigned RMWOpcode = MI.getOpcode(); + + MachineBasicBlock *CurrentMBB = LoopMBB; + BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB); + + switch (RMWOpcode) { + case PPC::ATOMIC_SWAP_I128: + PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo, + IncrHi, IncrLo); + break; + case PPC::ATOMIC_LOAD_ADD_I128: + BuildMI(CurrentMBB, DL, TII->get(PPC::ADDC8), ScratchLo) + .addReg(IncrLo) + .addReg(OldLo); + BuildMI(CurrentMBB, DL, TII->get(PPC::ADDE8), ScratchHi) + .addReg(IncrHi) + .addReg(OldHi); + break; + case PPC::ATOMIC_LOAD_SUB_I128: + BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFC8), ScratchLo) + .addReg(IncrLo) + .addReg(OldLo); + BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFE8), ScratchHi) + .addReg(IncrHi) + .addReg(OldHi); + break; + +#define TRIVIAL_ATOMICRMW(Opcode, Instr) \ + case Opcode: \ + BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchLo) \ + .addReg(IncrLo) \ + .addReg(OldLo); \ + BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchHi) \ + .addReg(IncrHi) \ + .addReg(OldHi); \ + break + + TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_OR_I128, PPC::OR8); + TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_XOR_I128, PPC::XOR8); + TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_AND_I128, PPC::AND8); + TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_NAND_I128, PPC::NAND8); +#undef TRIVIAL_ATOMICRMW + default: + llvm_unreachable("Unhandled atomic RMW operation"); + } + BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB); + BuildMI(CurrentMBB, DL, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(LoopMBB); + CurrentMBB->addSuccessor(LoopMBB); + CurrentMBB->addSuccessor(ExitMBB); + recomputeLiveIns(*LoopMBB); + recomputeLiveIns(*ExitMBB); + NMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128( + MachineBasicBlock &MBB, MachineInstr &MI, + MachineBasicBlock::iterator &NMBBI) { + const MCInstrDesc &LL = TII->get(PPC::LQARX); + const MCInstrDesc &SC = TII->get(PPC::STQCX); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + const BasicBlock *BB = MBB.getBasicBlock(); + Register Old = MI.getOperand(0).getReg(); + Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0); + Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1); + Register Scratch = MI.getOperand(1).getReg(); + Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0); + Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1); + Register RA = MI.getOperand(2).getReg(); + Register RB = MI.getOperand(3).getReg(); + Register CmpLo = MI.getOperand(4).getReg(); + Register CmpHi = MI.getOperand(5).getReg(); + Register NewLo = MI.getOperand(6).getReg(); + Register NewHi = MI.getOperand(7).getReg(); + // Create layout of control flow. + // loop: + // old = lqarx ptr + // <compare old, cmp> + // bne 0, fail + // succ: + // stqcx new ptr + // bne 0, loop + // b exit + // fail: + // stqcx old ptr + // exit: + // ....
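+ // The failure path still issues a stqcx., storing back the value just + // loaded: whether that store succeeds or fails, it clears the reservation + // obtained by lqarx, so the sequence never exits while holding one.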
+ MachineFunction::iterator MFI = ++MBB.getIterator(); + MachineBasicBlock *LoopCmpMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *CmpSuccMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *CmpFailMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(MFI, LoopCmpMBB); + MF->insert(MFI, CmpSuccMBB); + MF->insert(MFI, CmpFailMBB); + MF->insert(MFI, ExitMBB); + ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()), + MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopCmpMBB); + // Build loop. + MachineBasicBlock *CurrentMBB = LoopCmpMBB; + BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB); + BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchLo) + .addReg(OldLo) + .addReg(CmpLo); + BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchHi) + .addReg(OldHi) + .addReg(CmpHi); + BuildMI(CurrentMBB, DL, TII->get(PPC::OR8_rec), ScratchLo) + .addReg(ScratchLo) + .addReg(ScratchHi); + BuildMI(CurrentMBB, DL, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(CmpFailMBB); + CurrentMBB->addSuccessor(CmpSuccMBB); + CurrentMBB->addSuccessor(CmpFailMBB); + // Build succ. + CurrentMBB = CmpSuccMBB; + PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo, + NewHi, NewLo); + BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB); + BuildMI(CurrentMBB, DL, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(LoopCmpMBB); + BuildMI(CurrentMBB, DL, TII->get(PPC::B)).addMBB(ExitMBB); + CurrentMBB->addSuccessor(LoopCmpMBB); + CurrentMBB->addSuccessor(ExitMBB); + CurrentMBB = CmpFailMBB; + BuildMI(CurrentMBB, DL, SC).addReg(Old).addReg(RA).addReg(RB); + CurrentMBB->addSuccessor(ExitMBB); + + recomputeLiveIns(*LoopCmpMBB); + recomputeLiveIns(*CmpSuccMBB); + recomputeLiveIns(*CmpFailMBB); + recomputeLiveIns(*ExitMBB); + NMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +} // namespace + +INITIALIZE_PASS(PPCExpandAtomicPseudo, DEBUG_TYPE, "PowerPC Expand Atomic", + false, false) + +char PPCExpandAtomicPseudo::ID = 0; +FunctionPass *llvm::createPPCExpandAtomicPseudoPass() { + return new PPCExpandAtomicPseudo(); +} diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -876,6 +876,23 @@ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; + TargetLowering::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + TargetLowering::AtomicExpansionKind + shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + + Value *emitMaskedAtomicRMWIntrinsic(IRBuilderBase &Builder, + AtomicRMWInst *AI, Value *AlignedAddr, + Value *Incr, Value *Mask, + Value *ShiftAmt, + AtomicOrdering Ord) const override; + Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilderBase &Builder, + AtomicCmpXchgInst *CI, + Value *AlignedAddr, Value *CmpVal, + Value *NewVal, Value *Mask, + AtomicOrdering Ord) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -121,6 +121,11 @@ static cl::opt<bool>
UseAbsoluteJumpTables("ppc-use-absolute-jumptables", cl::desc("use absolute jump tables on ppc"), cl::Hidden); +static cl::opt<bool> EnableQuadwordAtomics( + "ppc-quadword-atomics", + cl::desc("enable quadword lock-free atomic operations"), cl::init(false), + cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -1281,6 +1286,9 @@ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); } + if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) + setMaxAtomicSizeInBitsSupported(128); + setBooleanContents(ZeroOrOneBooleanContent); if (Subtarget.hasAltivec()) { @@ -12628,6 +12636,17 @@ } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 || MI.getOpcode() == PPC::PROBED_ALLOCA_64) { return emitProbedAlloca(MI, BB); + } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) { + DebugLoc DL = MI.getDebugLoc(); + Register Src = MI.getOperand(2).getReg(); + Register Lo = MI.getOperand(0).getReg(); + Register Hi = MI.getOperand(1).getReg(); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY)) + .addDef(Lo) + .addUse(Src, 0, PPC::sub_gp8_x1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY)) + .addDef(Hi) + .addUse(Src, 0, PPC::sub_gp8_x0); } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -16042,6 +16061,22 @@ MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { + case Intrinsic::ppc_atomicrmw_xchg_i128: + case Intrinsic::ppc_atomicrmw_add_i128: + case Intrinsic::ppc_atomicrmw_sub_i128: + case Intrinsic::ppc_atomicrmw_nand_i128: + case Intrinsic::ppc_atomicrmw_and_i128: + case Intrinsic::ppc_atomicrmw_or_i128: + case Intrinsic::ppc_atomicrmw_xor_i128: + case Intrinsic::ppc_cmpxchg_i128: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Align(16); + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; + return true; case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: @@ -17442,3 +17477,102 @@ return CC_PPC64_ELF_FIS; } } + +TargetLowering::AtomicExpansionKind +PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + if (AI->isFloatingPointOperation()) + return AtomicExpansionKind::None; + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + return AtomicExpansionKind::MaskedIntrinsic; + return AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { + unsigned Size = AI->getPointerOperand() + ->getType() + ->getPointerElementType() + ->getPrimitiveSizeInBits(); + if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + return AtomicExpansionKind::MaskedIntrinsic; + return AtomicExpansionKind::None; +} + +static Intrinsic::ID +getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) { + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + return Intrinsic::ppc_atomicrmw_xchg_i128; + case AtomicRMWInst::Add: + return Intrinsic::ppc_atomicrmw_add_i128; + case AtomicRMWInst::Sub: + return Intrinsic::ppc_atomicrmw_sub_i128; + case AtomicRMWInst::And: + return Intrinsic::ppc_atomicrmw_and_i128; + case AtomicRMWInst::Or: + return Intrinsic::ppc_atomicrmw_or_i128; + case 
AtomicRMWInst::Xor: + return Intrinsic::ppc_atomicrmw_xor_i128; + case AtomicRMWInst::Nand: + return Intrinsic::ppc_atomicrmw_nand_i128; + } +} + +Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( + IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, + Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const { + assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && + "Only support quadword now"); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType(); + assert(ValTy->getPrimitiveSizeInBits() == 128); + Function *RMW = Intrinsic::getDeclaration( + M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); + Value *IncrHi = + Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi"); + Value *Addr = + Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(RMW, {Addr, IncrLo, IncrHi}); + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); +} + +Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( + IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, + Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { + assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && + "Only support quadword now"); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType(); + assert(ValTy->getPrimitiveSizeInBits() == 128); + Function *IntCmpXchg = + Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo"); + Value *CmpHi = + Builder.CreateTrunc(Builder.CreateLShr(CmpVal, 64), Int64Ty, "cmp_hi"); + Value *NewLo = Builder.CreateTrunc(NewVal, Int64Ty, "new_lo"); + Value *NewHi = + Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi"); + Value *Addr = + Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext())); + emitLeadingFence(Builder, CI, Ord); + Value *LoHi = + Builder.CreateCall(IntCmpXchg, {Addr, CmpLo, CmpHi, NewLo, NewHi}); + emitTrailingFence(Builder, CI, Ord); + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); +} diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -304,6 +304,88 @@ isPPC64, isRecordForm; } +def SPLIT_QUADWORD : PPCCustomInserterPseudo<(outs g8rc:$lo, g8rc:$hi), + (ins g8prc:$src), + "#SPLIT_QUADWORD", []>; +class AtomicRMW128<string asmstr> + : PPCPostRAExpPseudo<(outs g8prc:$RTp, g8prc:$scratch), + (ins memrr:$ptr, g8rc:$incr_lo, g8rc:$incr_hi), + asmstr, []>; +// The values in the MI's uses must stay as they are across iterations of the +// LL/SC loop, so mark both $RTp and $scratch earlyclobber.
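+// Note: g8prc models an even-odd pair of 64-bit GPRs, the quadword register +// shape required by lqarx/stqcx. (see the paired-GPR register class in +// PPCRegisterInfo.td).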
+let mayStore = 1, mayLoad = 1, + Defs = [CR0], + Constraints = "@earlyclobber $scratch,@earlyclobber $RTp" in { +// Atomic pseudo instructions expanded post-ra. +def ATOMIC_SWAP_I128 : AtomicRMW128<"#ATOMIC_SWAP_I128">; +def ATOMIC_LOAD_ADD_I128 : AtomicRMW128<"#ATOMIC_LOAD_ADD_I128">; +def ATOMIC_LOAD_SUB_I128 : AtomicRMW128<"#ATOMIC_LOAD_SUB_I128">; +def ATOMIC_LOAD_AND_I128 : AtomicRMW128<"#ATOMIC_LOAD_AND_I128">; +def ATOMIC_LOAD_XOR_I128 : AtomicRMW128<"#ATOMIC_LOAD_XOR_I128">; +def ATOMIC_LOAD_OR_I128 : AtomicRMW128<"#ATOMIC_LOAD_OR_I128">; +def ATOMIC_LOAD_NAND_I128 : AtomicRMW128<"#ATOMIC_LOAD_NAND_I128">; + +def ATOMIC_CMP_SWAP_I128 : PPCPostRAExpPseudo< + (outs g8prc:$RTp, g8prc:$scratch), + (ins memrr:$ptr, g8rc:$cmp_lo, g8rc:$cmp_hi, + g8rc:$new_lo, g8rc:$new_hi), + "#ATOMIC_CMP_SWAP_I128", []>; +} + +def : Pat<(int_ppc_atomicrmw_add_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_ADD_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_sub_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_SUB_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_xor_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_XOR_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_and_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_AND_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_nand_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_NAND_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_or_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_LOAD_OR_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_atomicrmw_xchg_i128 ForceXForm:$ptr, + i64:$incr_lo, + i64:$incr_hi), + (SPLIT_QUADWORD (ATOMIC_SWAP_I128 memrr:$ptr, + g8rc:$incr_lo, + g8rc:$incr_hi))>; +def : Pat<(int_ppc_cmpxchg_i128 ForceXForm:$ptr, + i64:$cmp_lo, + i64:$cmp_hi, + i64:$new_lo, + i64:$new_hi), + (SPLIT_QUADWORD (ATOMIC_CMP_SWAP_I128 + memrr:$ptr, + g8rc:$cmp_lo, + g8rc:$cmp_hi, + g8rc:$new_lo, + g8rc:$new_hi))>; + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC), "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1170,6 +1170,7 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">; def HasICBT : Predicate<"Subtarget->hasICBT()">; def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">; +def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">; def NoNaNsFPMath : Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">; def NaNsFPMath diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -139,6 +139,7 @@ bool HasICBT; bool HasInvariantFunctionDescriptors; bool HasPartwordAtomics; + bool HasQuadwordAtomics; bool HasDirectMove; bool HasHTM; bool HasFloat128; @@ -301,6 +302,7 @@ bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; } bool usePPCPostRASchedStrategy() const { return 
UsePPCPostRASchedStrategy; } bool hasPartwordAtomics() const { return HasPartwordAtomics; } + bool hasQuadwordAtomics() const { return HasQuadwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } Align getPlatformStackAlignment() const { diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -123,6 +123,7 @@ initializePPCTLSDynamicCallPass(PR); initializePPCMIPeepholePass(PR); initializePPCLowerMASSVEntriesPass(PR); + initializePPCExpandAtomicPseudoPass(PR); initializeGlobalISel(PR); } @@ -539,6 +540,10 @@ } void PPCPassConfig::addPreEmitPass2() { + // Schedule the expansion of AMOs at the last possible moment, avoiding the + // possibility for other passes to break the requirements for forward + // progress in the LL/SC block. + addPass(createPPCExpandAtomicPseudoPass()); // Must run branch selection immediately preceding the asm printer. addPass(createPPCBranchSelectionPass()); } diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -196,6 +196,7 @@ ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis +; CHECK-NEXT: PowerPC Expand Atomic ; CHECK-NEXT: PowerPC Branch Selector ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter diff --git a/llvm/test/CodeGen/PowerPC/atomics-i128.ll b/llvm/test/CodeGen/PowerPC/atomics-i128.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/atomics-i128.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown -mcpu=pwr8 \ +; RUN: -ppc-asm-full-reg-names -ppc-quadword-atomics \ +; RUN: -ppc-track-subreg-liveness < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 \ +; RUN: -ppc-asm-full-reg-names -ppc-quadword-atomics \ +; RUN: -ppc-track-subreg-liveness < %s | FileCheck --check-prefix=PWR7 %s + + +define i128 @swap(i128* %a, i128 %x) { +; CHECK-LABEL: swap: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB0_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: mr r9, r5 +; CHECK-NEXT: mr r8, r4 +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: swap: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_lock_test_and_set_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw xchg i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @add(i128* %a, i128 %x) { +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB1_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: addc r9, r5, r7 +; CHECK-NEXT: adde r8, r4, r6 +; CHECK-NEXT: stqcx. 
r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB1_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: add: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_add_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw add i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @sub(i128* %a, i128 %x) { +; CHECK-LABEL: sub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB2_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: subc r9, r7, r5 +; CHECK-NEXT: subfe r8, r4, r6 +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: sub: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_sub_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw sub i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @and(i128* %a, i128 %x) { +; CHECK-LABEL: and: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB3_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: and r9, r5, r7 +; CHECK-NEXT: and r8, r4, r6 +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB3_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: and: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_and_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw and i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @or(i128* %a, i128 %x) { +; CHECK-LABEL: or: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB4_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: or r9, r5, r7 +; CHECK-NEXT: or r8, r4, r6 +; CHECK-NEXT: stqcx. 
r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB4_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: or: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_or_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw or i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @xor(i128* %a, i128 %x) { +; CHECK-LABEL: xor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB5_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: xor r9, r5, r7 +; CHECK-NEXT: xor r8, r4, r6 +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB5_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: xor: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_xor_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw xor i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +define i128 @nand(i128* %a, i128 %x) { +; CHECK-LABEL: nand: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB6_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r6, 0, r3 +; CHECK-NEXT: nand r9, r5, r7 +; CHECK-NEXT: nand r8, r4, r6 +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: bne cr0, .LBB6_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: blr +; +; PWR7-LABEL: nand: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_fetch_and_nand_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = atomicrmw nand i128* %a, i128 %x seq_cst, align 16 + ret i128 %0 +} + +;; CmpXchg +define i128 @cas_weak_acquire_acquire(i128* %a, i128 %cmp, i128 %new) { +; CHECK-LABEL: cas_weak_acquire_acquire: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: .LBB7_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r8, 0, r3 +; CHECK-NEXT: xor r11, r9, r5 +; CHECK-NEXT: xor r10, r8, r4 +; CHECK-NEXT: or. r11, r11, r10 +; CHECK-NEXT: bne cr0, .LBB7_3 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: mr r11, r7 +; CHECK-NEXT: mr r10, r6 +; CHECK-NEXT: stqcx. r10, 0, r3 +; CHECK-NEXT: bne cr0, .LBB7_1 +; CHECK-NEXT: b .LBB7_4 +; CHECK-NEXT: .LBB7_3: # %entry +; CHECK-NEXT: stqcx. 
r8, 0, r3 +; CHECK-NEXT: .LBB7_4: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r8 +; CHECK-NEXT: mr r4, r9 +; CHECK-NEXT: blr +; +; PWR7-LABEL: cas_weak_acquire_acquire: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: bl __sync_val_compare_and_swap_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new acquire acquire + %1 = extractvalue { i128, i1 } %0, 0 + ret i128 %1 +} + +define i128 @cas_weak_release_monotonic(i128* %a, i128 %cmp, i128 %new) { +; CHECK-LABEL: cas_weak_release_monotonic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: .LBB8_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r8, 0, r3 +; CHECK-NEXT: xor r11, r9, r5 +; CHECK-NEXT: xor r10, r8, r4 +; CHECK-NEXT: or. r11, r11, r10 +; CHECK-NEXT: bne cr0, .LBB8_3 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: mr r11, r7 +; CHECK-NEXT: mr r10, r6 +; CHECK-NEXT: stqcx. r10, 0, r3 +; CHECK-NEXT: bne cr0, .LBB8_1 +; CHECK-NEXT: b .LBB8_4 +; CHECK-NEXT: .LBB8_3: # %entry +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: .LBB8_4: # %entry +; CHECK-NEXT: mr r3, r8 +; CHECK-NEXT: mr r4, r9 +; CHECK-NEXT: blr +; +; PWR7-LABEL: cas_weak_release_monotonic: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: lwsync +; PWR7-NEXT: bl __sync_val_compare_and_swap_16 +; PWR7-NEXT: nop +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new release monotonic + %1 = extractvalue { i128, i1 } %0, 0 + ret i128 %1 +} + +define i128 @cas_sc_sc(i128* %a, i128 %cmp, i128 %new) { +; CHECK-LABEL: cas_sc_sc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sync +; CHECK-NEXT: .LBB9_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r8, 0, r3 +; CHECK-NEXT: xor r11, r9, r5 +; CHECK-NEXT: xor r10, r8, r4 +; CHECK-NEXT: or. r11, r11, r10 +; CHECK-NEXT: bne cr0, .LBB9_3 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: mr r11, r7 +; CHECK-NEXT: mr r10, r6 +; CHECK-NEXT: stqcx. r10, 0, r3 +; CHECK-NEXT: bne cr0, .LBB9_1 +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_3: # %entry +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: .LBB9_4: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r8 +; CHECK-NEXT: mr r4, r9 +; CHECK-NEXT: blr +; +; PWR7-LABEL: cas_sc_sc: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: sync +; PWR7-NEXT: bl __sync_val_compare_and_swap_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = cmpxchg i128* %a, i128 %cmp, i128 %new seq_cst seq_cst + %1 = extractvalue { i128, i1 } %0, 0 + ret i128 %1 +} + +define i128 @cas_acqrel_acquire(i128* %a, i128 %cmp, i128 %new) { +; CHECK-LABEL: cas_acqrel_acquire: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: .LBB10_1: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: lqarx r8, 0, r3 +; CHECK-NEXT: xor r11, r9, r5 +; CHECK-NEXT: xor r10, r8, r4 +; CHECK-NEXT: or. 
r11, r11, r10 +; CHECK-NEXT: bne cr0, .LBB10_3 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: # +; CHECK-NEXT: mr r11, r7 +; CHECK-NEXT: mr r10, r6 +; CHECK-NEXT: stqcx. r10, 0, r3 +; CHECK-NEXT: bne cr0, .LBB10_1 +; CHECK-NEXT: b .LBB10_4 +; CHECK-NEXT: .LBB10_3: # %entry +; CHECK-NEXT: stqcx. r8, 0, r3 +; CHECK-NEXT: .LBB10_4: # %entry +; CHECK-NEXT: lwsync +; CHECK-NEXT: mr r3, r8 +; CHECK-NEXT: mr r4, r9 +; CHECK-NEXT: blr +; +; PWR7-LABEL: cas_acqrel_acquire: +; PWR7: # %bb.0: # %entry +; PWR7-NEXT: mflr r0 +; PWR7-NEXT: std r0, 16(r1) +; PWR7-NEXT: stdu r1, -112(r1) +; PWR7-NEXT: .cfi_def_cfa_offset 112 +; PWR7-NEXT: .cfi_offset lr, 16 +; PWR7-NEXT: lwsync +; PWR7-NEXT: bl __sync_val_compare_and_swap_16 +; PWR7-NEXT: nop +; PWR7-NEXT: lwsync +; PWR7-NEXT: addi r1, r1, 112 +; PWR7-NEXT: ld r0, 16(r1) +; PWR7-NEXT: mtlr r0 +; PWR7-NEXT: blr +entry: + %0 = cmpxchg i128* %a, i128 %cmp, i128 %new acq_rel acquire + %1 = extractvalue { i128, i1 } %0, 0 + ret i128 %1 +}
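For reference, here is a hand-written sketch (not part of the patch) of the IR shape that AtomicExpandPass should produce for the add case above, mirroring emitMaskedAtomicRMWIntrinsic; names are illustrative and the fences emitted around the operation are omitted:

; Sketch only: i128 atomicrmw add after expansion to the quadword intrinsic.
; The two i64 results carry the low/high halves of the old value, which are
; reassembled with zext/shl/or.
declare { i64, i64 } @llvm.ppc.atomicrmw.add.i128(i8*, i64, i64)

define i128 @expanded_add(i8* %ptr, i128 %x) {
entry:
  %incr_lo = trunc i128 %x to i64
  %x.hi = lshr i128 %x, 64
  %incr_hi = trunc i128 %x.hi to i64
  %lohi = call { i64, i64 } @llvm.ppc.atomicrmw.add.i128(i8* %ptr, i64 %incr_lo, i64 %incr_hi)
  %lo = extractvalue { i64, i64 } %lohi, 0
  %hi = extractvalue { i64, i64 } %lohi, 1
  %lo64 = zext i64 %lo to i128
  %hi64 = zext i64 %hi to i128
  %hi.shl = shl i128 %hi64, 64
  %val64 = or i128 %lo64, %hi.shl
  ret i128 %val64
}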