diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1501,3 +1501,13 @@
               [llvm_v512i1_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
                llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
 }
+
+//===----------------------------------------------------------------------===//
+// PowerPC Atomic Intrinsic Definitions.
+let TargetPrefix = "ppc" in {
+  def int_ppc_lqarx : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty],
+                                [IntrNoFree, IntrWillReturn]>;
+  def int_ppc_stqcx : Intrinsic<[llvm_i32_ty],
+                                [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty],
+                                [IntrNoFree, IntrWillReturn]>;
+}
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -161,6 +161,9 @@
 def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
                                              "HasPartwordAtomics", "true",
                                              "Enable l[bh]arx and st[bh]cx.">;
+def FeatureQuadwordAtomic : SubtargetFeature<"quadword-atomics",
+                                             "HasQuadwordAtomics", "true",
+                                             "Enable lqarx and stqcx.">;
 def FeatureInvariantFunctionDescriptors :
   SubtargetFeature<"invariant-function-descriptors",
                    "HasInvariantFunctionDescriptors", "true",
@@ -327,6 +330,7 @@
     FeatureDirectMove,
     FeatureICBT,
     FeaturePartwordAtomic,
+    FeatureQuadwordAtomic,
     FeaturePredictableSelectIsExpensive
   ];
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -573,6 +573,9 @@
     ATOMIC_CMP_SWAP_8,
     ATOMIC_CMP_SWAP_16,
 
+    /// STQCX - Store conditional for quadword atomic operations.
+    STQCX,
+
     /// GPRC = TOC_ENTRY GA, TOC
     /// Loads the entry for GA from the TOC, where the TOC base is given by
     /// the last operand.
@@ -871,6 +874,15 @@
       return true;
     }
 
+    TargetLowering::AtomicExpansionKind
+    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                          AtomicOrdering Ord) const override;
+
+    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr,
+                                AtomicOrdering Ord) const override;
+
     Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
                                   AtomicOrdering Ord) const override;
     Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
@@ -1333,6 +1345,7 @@
                                            SelectionDAG &DAG) const;
     SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
                                  DAGCombinerInfo &DCI) const;
+    SDValue combineStoreConditional(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
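With these hooks in place, the generic AtomicExpand pass rewrites a 128-bit atomicrmw into a load-linked/store-conditional retry loop. Below is a minimal C++ model of that loop shape, not part of the patch: load_linked and store_conditional are illustrative stand-ins for lqarx/stqcx (a real stqcx. may fail and force another iteration), and atomicFetchAdd128 is a hypothetical name.

#include <cassert>

using uint128 = unsigned __int128;

// Stand-in for lqarx: load the quadword and acquire a reservation.
static uint128 load_linked(const uint128 *P) { return *P; }

// Stand-in for stqcx.: store only if the reservation still holds. In this
// single-threaded model it always succeeds; on hardware it may return false.
static bool store_conditional(uint128 *P, uint128 V) {
  *P = V;
  return true;
}

// Shape of the loop AtomicExpand emits for `atomicrmw add i128* %p, ...`:
// load-linked, compute the new value, then retry until the conditional
// store succeeds.
static uint128 atomicFetchAdd128(uint128 *P, uint128 Bits) {
  uint128 Old;
  do {
    Old = load_linked(P);                        // emitLoadLinked
  } while (!store_conditional(P, Old + Bits));   // emitStoreConditional
  return Old;
}

int main() {
  uint128 V = 41;
  assert(atomicFetchAdd128(&V, 1) == 41 && V == 42);
  return 0;
}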
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1603,6 +1603,7 @@
   case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
   case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
   case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
+  case PPCISD::STQCX: return "PPCISD::STQCX";
   case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
   case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
   case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
@@ -12592,6 +12593,17 @@
   } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
              MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
     return emitProbedAlloca(MI, BB);
+  } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
+    DebugLoc DL = MI.getDebugLoc();
+    Register Src = MI.getOperand(2).getReg();
+    Register Hi = MI.getOperand(0).getReg();
+    Register Lo = MI.getOperand(1).getReg();
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+        .addDef(Hi)
+        .addUse(Src, 0, PPC::sub_gp8_x0);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+        .addDef(Lo)
+        .addUse(Src, 0, PPC::sub_gp8_x1);
   } else {
     llvm_unreachable("Unexpected instr type to insert");
   }
@@ -15280,6 +15292,10 @@
     }
     break;
   case ISD::BR_CC: {
+    // If this is a branch on a store conditional comparison, lower this to
+    // a branch directly on CR0.
+    if (SDValue SCBr = combineStoreConditional(N, DCI))
+      return SCBr;
     // If this is a branch on an altivec predicate comparison, lower this so
     // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
     // lowering is done pre-legalize, because the legalizer lowers the predicate
@@ -15966,6 +15982,22 @@
                                            MachineFunction &MF,
                                            unsigned Intrinsic) const {
   switch (Intrinsic) {
+  case Intrinsic::ppc_lqarx:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Align(16);
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
+    return true;
+  case Intrinsic::ppc_stqcx:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = Align(16);
+    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
+    return true;
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
@@ -17346,3 +17378,84 @@
   }
   return Mode;
 }
+
+TargetLowering::AtomicExpansionKind
+PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  if (AI->isFloatingPointOperation())
+    return AtomicExpansionKind::None;
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  if (Subtarget.isAIXABI() && Subtarget.hasQuadwordAtomics() && Size == 128)
+    return AtomicExpansionKind::LLSC;
+  return AtomicExpansionKind::None;
+}
+
+Value *PPCTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                         AtomicOrdering Ord) const {
+  assert(Subtarget.hasQuadwordAtomics() && "Only support quadword now");
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  assert(ValTy->getPrimitiveSizeInBits() == 128);
+  Function *Lqarx = Intrinsic::getDeclaration(M, Intrinsic::ppc_lqarx);
+  Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+  Value *HiLo = Builder.CreateCall(Lqarx, Addr, "hilo");
+  // Follow the big-endian convention: the first member is the high doubleword.
+  Value *Hi = Builder.CreateExtractValue(HiLo, 0, "hi");
+  Value *Lo = Builder.CreateExtractValue(HiLo, 1, "lo");
+  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+  return Builder.CreateOr(
+      Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+}
+
+Value *PPCTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                               Value *Addr,
+                                               AtomicOrdering Ord) const {
+  assert(Subtarget.hasQuadwordAtomics() && "Only support quadword now");
+  assert(Val->getType()->getPrimitiveSizeInBits() == 128);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Stqcx = Intrinsic::getDeclaration(M, Intrinsic::ppc_stqcx);
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+  Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+  Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+  Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+  return Builder.CreateCall(Stqcx, {Hi, Lo, Addr});
+}
+
+SDValue PPCTargetLowering::combineStoreConditional(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BR_CC && "Expect a branch node");
+  SelectionDAG &DAG = DCI.DAG;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+  if (LHS.getOpcode() != ISD::INTRINSIC_W_CHAIN || CC != ISD::SETNE ||
+      !LHS.hasOneUse() || !isa<ConstantSDNode>(RHS))
+    return SDValue();
+  unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+  if (Val)
+    return SDValue();
+  SDLoc DL(N);
+  SDNode *Node = LHS.getNode();
+  SDValue Chain = Node->getOperand(0);
+  unsigned IntrinsicID =
+      cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+  switch (IntrinsicID) {
+  default:
+    return SDValue();
+  case Intrinsic::ppc_stqcx: {
+    MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+    SDValue Ops[] = {Chain, Node->getOperand(2), Node->getOperand(3),
+                     Node->getOperand(4)};
+    SDValue SC = DAG.getMemIntrinsicNode(
+        PPCISD::STQCX, DL, DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue), Ops,
+        MVT::i128, MMO);
+    SDValue CR0 = DAG.getRegister(PPC::CR0, MVT::i32);
+    DAG.ReplaceAllUsesOfValueWith(LHS, CR0);
+    DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), Chain);
+    SDValue Br =
+        DAG.getNode(PPCISD::COND_BRANCH, DL, MVT::Other, N->getOperand(0),
+                    DAG.getConstant(PPC::PRED_NE, DL, MVT::i32), CR0,
+                    N->getOperand(4), SC.getValue(2));
+    return Br;
+  }
+  }
+}
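The two emit hooks above convert between an i128 value and the {hi, lo} pair of i64 halves carried by the intrinsics; combineStoreConditional then folds the stqcx-result-not-zero compare into a branch directly on CR0. The split/join arithmetic (zext/shl/or to assemble, lshr/trunc to take apart) can be sanity-checked with the standalone C++ sketch below, assuming the big-endian {hi, lo} order used above; the helper names are hypothetical.

#include <cassert>
#include <cstdint>

using uint128 = unsigned __int128;

// Mirrors emitLoadLinked: val = (zext(hi) << 64) | zext(lo).
static uint128 joinHalves(uint64_t Hi, uint64_t Lo) {
  return ((uint128)Hi << 64) | Lo;
}

// Mirrors emitStoreConditional: lo = trunc(val), hi = trunc(val >> 64).
static void splitHalves(uint128 Val, uint64_t &Hi, uint64_t &Lo) {
  Lo = (uint64_t)Val;
  Hi = (uint64_t)(Val >> 64);
}

int main() {
  uint64_t Hi, Lo;
  splitHalves(joinHalves(0x0123456789abcdefULL, 0xfedcba9876543210ULL), Hi, Lo);
  assert(Hi == 0x0123456789abcdefULL);
  assert(Lo == 0xfedcba9876543210ULL);
  return 0;
}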
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -291,6 +291,12 @@
   Requires<[IsISA3_0]>;
 }
 
+def SPLIT_QUADWORD : PPCCustomInserterPseudo<(outs g8rc:$hi, g8rc:$lo),
+                                             (ins g8prc:$src),
+                                             "#SPLIT_QUADWORD", []>;
+def : Pat<(int_ppc_lqarx ForceXForm:$src),
+          (SPLIT_QUADWORD (LQARX memrr:$src))>;
+
 let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
 def STDCX : XForm_1_memOp<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
                           "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isRecordForm;
@@ -300,6 +306,13 @@
                           "stqcx. $RSp, $dst", IIC_LdStSTQCX, []>, isRecordForm;
 }
 
+def BUILD_QUADWORD : PPCPostRAExpPseudo<(outs g8prc:$RTp),
+                                        (ins g8rc:$src0, g8rc:$src1),
+                                        "#BUILD_QUADWORD", []>;
+
+def : Pat<(PPCstqcx i64:$hi, i64:$lo, ForceXForm:$dst),
+          (STQCX (i128 (BUILD_QUADWORD g8rc:$hi, g8rc:$lo)), memrr:$dst)>;
+
 let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
 def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
             "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -3114,6 +3114,39 @@
     MI.RemoveOperand(0);
     return true;
   }
+  case PPC::BUILD_QUADWORD: {
+    unsigned DestIdx = MI.getOperand(0).getReg() - PPC::G8p0;
+    Register DestX0 = PPC::X0 + 2 * DestIdx;
+    Register DestX1 = DestX0 + 1;
+    // Make sure we do not clobber a source register before it is copied.
+    Register Src0 = MI.getOperand(1).getReg();
+    Register Src1 = MI.getOperand(2).getReg();
+    // The trickiest case: the destinations and sources are exactly swapped.
+    if (DestX0 == Src1 && DestX1 == Src0) {
+      BuildMI(MBB, MI, DL, get(PPC::XOR8), DestX0)
+          .addReg(DestX0)
+          .addReg(DestX1);
+      BuildMI(MBB, MI, DL, get(PPC::XOR8), DestX1)
+          .addReg(DestX0)
+          .addReg(DestX1);
+      BuildMI(MBB, MI, DL, get(PPC::XOR8), DestX0)
+          .addReg(DestX0)
+          .addReg(DestX1);
+    } else if (DestX0 != Src0 || DestX1 != Src1) {
+      if (DestX0 == Src1 || DestX1 != Src0) {
+        BuildMI(MBB, MI, DL, get(PPC::OR8), DestX1).addReg(Src1).addReg(Src1);
+        BuildMI(MBB, MI, DL, get(PPC::OR8), DestX0).addReg(Src0).addReg(Src0);
+      } else {
+        BuildMI(MBB, MI, DL, get(PPC::OR8), DestX0).addReg(Src0).addReg(Src0);
+        BuildMI(MBB, MI, DL, get(PPC::OR8), DestX1).addReg(Src1).addReg(Src1);
+      }
+    }
+    MI.setDesc(get(PPC::UNENCODED_NOP));
+    MI.RemoveOperand(2);
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    return true;
+  }
   }
   return false;
 }
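The BUILD_QUADWORD expansion above must move (Src0, Src1) into the fixed even/odd destination pair without reading an already-clobbered source: an exact swap uses the three-XOR trick, and a partial overlap picks the copy order so that the aliased source is read first. The following standalone C++ sketch models that ordering logic on a mock register file; the indices are illustrative, not real PPC register numbers.

#include <cassert>
#include <cstdint>

// Copy (R[S0], R[S1]) into (R[D0], R[D1]) with reg-to-reg moves only,
// mirroring the BUILD_QUADWORD post-RA expansion.
static void buildPair(uint64_t R[], int D0, int D1, int S0, int S1) {
  if (D0 == S1 && D1 == S0) {
    // Exact swap: three XORs, no scratch register (the XOR8 sequence).
    R[D0] ^= R[D1];
    R[D1] ^= R[D0];
    R[D0] ^= R[D1];
  } else if (D0 != S0 || D1 != S1) {
    if (D0 == S1 || D1 != S0) {
      R[D1] = R[S1]; // Copy the low half first so D0 cannot clobber S1.
      R[D0] = R[S0];
    } else {
      R[D0] = R[S0]; // D1 == S0: copy the high half first.
      R[D1] = R[S1];
    }
  } // Otherwise the pair is already in place and the pseudo becomes a nop.
}

int main() {
  uint64_t R[4] = {1, 2, 3, 4};
  buildPair(R, 0, 1, 1, 0); // Swap case: (R0, R1) <- (R1, R0).
  assert(R[0] == 2 && R[1] == 1);
  buildPair(R, 2, 3, 3, 0); // Overlap case: D1 aliases S0, so D0 is copied first.
  assert(R[2] == 4 && R[3] == 2);
  return 0;
}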
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -83,6 +83,9 @@
 def SDT_PPCstbrx : SDTypeProfile<0, 3, [
   SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
 ]>;
+def SDT_PPCstqcx : SDTypeProfile<1, 3, [
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisPtrTy<3>
+]>;
 
 def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
   SDTCisPtrTy<0>, SDTCisVT<1, i32>
@@ -357,6 +360,8 @@
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
                       [SDNPHasChain, SDNPMayStore]>;
+def PPCstqcx : SDNode<"PPCISD::STQCX", SDT_PPCstqcx,
+                      [SDNPMayStore, SDNPOutGlue, SDNPHasChain]>;
 
 // Instructions to set/unset CR bit 6 for SVR4 vararg calls
 def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone,
@@ -1170,6 +1175,7 @@
 def HasSPE : Predicate<"Subtarget->hasSPE()">;
 def HasICBT : Predicate<"Subtarget->hasICBT()">;
 def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">;
+def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">;
 def NoNaNsFPMath
     : Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
 def NaNsFPMath
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -139,6 +139,7 @@
   bool HasICBT;
   bool HasInvariantFunctionDescriptors;
   bool HasPartwordAtomics;
+  bool HasQuadwordAtomics;
   bool HasDirectMove;
   bool HasHTM;
   bool HasFloat128;
@@ -301,6 +302,7 @@
   bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; }
   bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; }
   bool hasPartwordAtomics() const { return HasPartwordAtomics; }
+  bool hasQuadwordAtomics() const { return HasQuadwordAtomics; }
   bool hasDirectMove() const { return HasDirectMove; }
 
   Align getPlatformStackAlignment() const {
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -119,6 +119,7 @@
   HasICBT = false;
   HasInvariantFunctionDescriptors = false;
   HasPartwordAtomics = false;
+  HasQuadwordAtomics = false;
   HasDirectMove = false;
   HasHTM = false;
   HasFloat128 = false;
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-atomic-128.ll b/llvm/test/CodeGen/PowerPC/ppc64-atomic-128.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-atomic-128.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix-xcoff -mcpu=pwr8 \
+; RUN:   -ppc-track-subreg-liveness < %s | FileCheck %s
+
+@var = global i128 0
+
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    std 0, 16(1)
+; CHECK-NEXT:    stdu 1, -112(1)
+; CHECK-NEXT:    bl .__sync_val_compare_and_swap_16[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    addi 1, 1, 112
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+  %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+  %val = extractvalue { i128, i1 } %pair, 0
+  ret i128 %val
+}
+
+define void @fetch_and_nand(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB1_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    nand 8, 6, 4
+; CHECK-NEXT:    nand 10, 7, 5
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB1_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw nand i128* %p, i128 %bits release
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_or(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB2_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    or 8, 6, 4
+; CHECK-NEXT:    or 10, 7, 5
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB2_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw or i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_add(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB3_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 8, 0, 3
+; CHECK-NEXT:    ori 6, 9, 0
+; CHECK-NEXT:    mr 7, 8
+; CHECK-NEXT:    addc 8, 6, 5
+; CHECK-NEXT:    adde 9, 7, 4
+; CHECK-NEXT:    ori 10, 9, 0
+; CHECK-NEXT:    or 9, 10, 9
+; CHECK-NEXT:    xor 8, 8, 9
+; CHECK-NEXT:    xor 9, 8, 9
+; CHECK-NEXT:    xor 8, 8, 9
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB3_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 7, 0(3)
+; CHECK-NEXT:    std 6, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw add i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_sub(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_sub:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB4_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 8, 0, 3
+; CHECK-NEXT:    ori 6, 9, 0
+; CHECK-NEXT:    mr 7, 8
+; CHECK-NEXT:    subc 8, 6, 5
+; CHECK-NEXT:    subfe 9, 4, 7
+; CHECK-NEXT:    ori 10, 9, 0
+; CHECK-NEXT:    or 9, 10, 9
+; CHECK-NEXT:    xor 8, 8, 9
+; CHECK-NEXT:    xor 9, 8, 9
+; CHECK-NEXT:    xor 8, 8, 9
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB4_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 7, 0(3)
+; CHECK-NEXT:    std 6, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw sub i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_min(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB5_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:    cmpd 1, 6, 4
+; CHECK-NEXT:    crnor 20, 5, 2
+; CHECK-NEXT:    cmpld 1, 7, 5
+; CHECK-NEXT:    crandc 21, 2, 5
+; CHECK-NEXT:    cror 20, 21, 20
+; CHECK-NEXT:    isel 8, 6, 4, 20
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    isel 10, 7, 5, 20
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB5_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw min i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_max(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_max:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB6_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:    cmpd 1, 6, 4
+; CHECK-NEXT:    crandc 20, 5, 2
+; CHECK-NEXT:    cmpld 1, 7, 5
+; CHECK-NEXT:    crand 21, 2, 5
+; CHECK-NEXT:    cror 20, 21, 20
+; CHECK-NEXT:    isel 8, 6, 4, 20
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    isel 10, 7, 5, 20
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB6_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw max i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_umin(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB7_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:    cmpld 1, 7, 5
+; CHECK-NEXT:    crnor 20, 1, 2
+; CHECK-NEXT:    crandc 21, 2, 5
+; CHECK-NEXT:    cror 20, 21, 20
+; CHECK-NEXT:    isel 8, 6, 4, 20
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    isel 10, 7, 5, 20
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB7_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .align 4
+; CHECK-NEXT:  L..BB8_1: # %atomicrmw.start
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx 6, 0, 3
+; CHECK-NEXT:    ori 7, 7, 0
+; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:    cmpld 1, 7, 5
+; CHECK-NEXT:    crandc 20, 1, 2
+; CHECK-NEXT:    crand 21, 2, 5
+; CHECK-NEXT:    cror 20, 21, 20
+; CHECK-NEXT:    isel 8, 6, 4, 20
+; CHECK-NEXT:    ori 9, 8, 0
+; CHECK-NEXT:    isel 10, 7, 5, 20
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    mr 9, 10
+; CHECK-NEXT:    stqcx. 8, 0, 3
+; CHECK-NEXT:    bne 0, L..BB8_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    ld 3, L..C0(2) # @var
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 6, 0(3)
+; CHECK-NEXT:    std 7, 8(3)
+; CHECK-NEXT:    blr
+  %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}