diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1741,4 +1741,11 @@
                        llvm_i64_ty, llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
                       [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+  def int_ppc_atomic_load_i128 :
+      Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+                [llvm_ptr_ty],
+                [IntrArgMemOnly, IntrReadMem, NoCapture<ArgIndex<0>>]>;
+  def int_ppc_atomic_store_i128 :
+      Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty],
+                [IntrArgMemOnly, IntrWriteMem, NoCapture<ArgIndex<2>>]>;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
--- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -102,6 +102,16 @@
     return expandAtomicRMW128(MBB, MI, NMBBI);
   case PPC::ATOMIC_CMP_SWAP_I128:
     return expandAtomicCmpSwap128(MBB, MI, NMBBI);
+  case PPC::BUILD_QUADWORD: {
+    Register Dst = MI.getOperand(0).getReg();
+    Register DstHi = TRI->getSubReg(Dst, PPC::sub_gp8_x0);
+    Register DstLo = TRI->getSubReg(Dst, PPC::sub_gp8_x1);
+    Register Lo = MI.getOperand(1).getReg();
+    Register Hi = MI.getOperand(2).getReg();
+    PairedCopy(TII, MBB, MI, MI.getDebugLoc(), DstHi, DstLo, Hi, Lo);
+    MI.eraseFromParent();
+    return true;
+  }
   default:
     return false;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1246,6 +1246,7 @@
     SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1286,8 +1286,12 @@
     setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
   }

-  if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics())
+  if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) {
     setMaxAtomicSizeInBitsSupported(128);
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+    setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
+  }

   setBooleanContents(ZeroOrOneBooleanContent);
@@ -1518,6 +1522,7 @@
       PPC::MOF_NotAddNorCst | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
       PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
   };
+  // TODO: Add mapping for quadword load/store.
 }

 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
@@ -10452,11 +10457,18 @@
   case Intrinsic::ppc_cfence: {
     assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
    assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
-    return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
-                                      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
-                                                  Op.getOperand(ArgStart + 1)),
-                                      Op.getOperand(0)),
-                   0);
+    SDValue Val = Op.getOperand(ArgStart + 1);
+    EVT Ty = Val.getValueType();
+    if (Ty == MVT::i128) {
+      // FIXME: Is testing one of the two paired registers sufficient to
+      // guarantee ordering?
+      Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val);
+    }
+    return SDValue(
+        DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
+                           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Val),
+                           Op.getOperand(0)),
+        0);
   }
   default:
     break;
@@ -10519,6 +10531,59 @@
   return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
 }

+SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  AtomicSDNode *N = cast<AtomicSDNode>(Op.getNode());
+  EVT MemVT = N->getMemoryVT();
+  MVT VT = MemVT.getSimpleVT();
+  assert(VT == MVT::i128 && "Expect quadword atomic operations");
+  SDLoc dl(N);
+  unsigned Opc = N->getOpcode();
+  switch (Opc) {
+  case ISD::ATOMIC_LOAD: {
+    // Lower quadword atomic load to int_ppc_atomic_load_i128, which will be
+    // lowered to PPC instructions by the pattern-matching instruction selector.
+    SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+    SmallVector<SDValue> Ops{
+        N->getOperand(0),
+        DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32)};
+    for (int I = 1, E = N->getNumOperands(); I < E; ++I)
+      Ops.push_back(N->getOperand(I));
+    SDValue LoadedVal = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys,
+                                                Ops, MemVT, N->getMemOperand());
+    SDValue ValLo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal);
+    SDValue ValHi =
+        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal.getValue(1));
+    ValHi = DAG.getNode(ISD::SHL, dl, MVT::i128, ValHi,
+                        DAG.getConstant(64, dl, MVT::i32));
+    SDValue Val =
+        DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {ValLo, ValHi});
+    return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
+                       {Val, LoadedVal.getValue(2)});
+  }
+  case ISD::ATOMIC_STORE: {
+    // Lower quadword atomic store to int_ppc_atomic_store_i128, which will be
+    // lowered to PPC instructions by the pattern-matching instruction selector.
+    SDVTList Tys = DAG.getVTList(MVT::Other);
+    SmallVector<SDValue> Ops{
+        N->getOperand(0),
+        DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
+    SDValue Val = N->getOperand(2);
+    SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
+    SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
+                                DAG.getConstant(64, dl, MVT::i32));
+    ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
+    Ops.push_back(ValLo);
+    Ops.push_back(ValHi);
+    Ops.push_back(N->getOperand(1));
+    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
+                                   N->getMemOperand());
+  }
+  default:
+    llvm_unreachable("Unexpected atomic opcode");
+  }
+}
+
 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -10910,6 +10975,8 @@
     return LowerBSWAP(Op, DAG);
   case ISD::ATOMIC_CMP_SWAP:
     return LowerATOMIC_CMP_SWAP(Op, DAG);
+  case ISD::ATOMIC_STORE:
+    return LowerATOMIC_LOAD_STORE(Op, DAG);
   }
 }
@@ -10920,6 +10987,12 @@
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::ATOMIC_LOAD: {
+    SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);
+    Results.push_back(Res);
+    Results.push_back(Res.getValue(1));
+    break;
+  }
   case ISD::READCYCLECOUNTER: {
     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
     SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
@@ -12656,6 +12729,24 @@
     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
         .addDef(Hi)
         .addUse(Src, 0, PPC::sub_gp8_x0);
+  } else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
+             MI.getOpcode() == PPC::STQX_PSEUDO) {
+    DebugLoc DL = MI.getDebugLoc();
+    // Ptr is used as the ptr_rc_no_r0 part of LQ/STQ's memory operand and
+    // holds the result of adding RA and RB, so it has to be in
+    // g8rc_and_g8rc_nox0.
+    Register Ptr =
+        F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    Register Val = MI.getOperand(0).getReg();
+    Register RA = MI.getOperand(1).getReg();
+    Register RB = MI.getOperand(2).getReg();
+    BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);
+    BuildMI(*BB, MI, DL,
+            MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)
+                                              : TII->get(PPC::STQ))
+        .addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? RegState::Define : 0)
+        .addImm(0)
+        .addReg(Ptr);
   } else {
     llvm_unreachable("Unexpected instr type to insert");
   }
@@ -16091,6 +16182,22 @@
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
     return true;
+  case Intrinsic::ppc_atomic_load_i128:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Align(16);
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
+    return true;
+  case Intrinsic::ppc_atomic_store_i128:
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = Align(16);
+    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
+    return true;
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
@@ -17280,7 +17387,8 @@
   EVT MemVT = MN->getMemoryVT();
   unsigned Size = MemVT.getSizeInBits();
   if (MemVT.isScalarInteger()) {
-    assert(Size <= 64 && "Not expecting scalar integers larger than 8 bytes!");
+    assert(Size <= 128 &&
+           "Not expecting scalar integers larger than 16 bytes!");
     if (Size < 32)
       FlagSet |= PPC::MOF_SubWordInt;
     else if (Size == 32)
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1342,12 +1342,25 @@
                              []>,
                     RegConstraint<"@earlyclobber $RTp">, isPPC64;

+// We don't really have an LQX instruction in the ISA; add a pseudo so that
+// the X-form can be handled during isel. Making it a pre-RA pseudo may expose
+// opportunities for optimizations (CSE, LICM, etc.) on the result of adding
+// RA and RB.
+def LQX_PSEUDO : PPCCustomInserterPseudo<(outs g8prc:$RTp),
+                                         (ins memrr:$src), "#LQX_PSEUDO", []>;
+
 def RESTORE_QUADWORD : PPCEmitTimePseudo<(outs g8prc:$RTp), (ins memrix:$src),
                                          "#RESTORE_QUADWORD", []>;
 }
 }

+def : Pat<(int_ppc_atomic_load_i128 iaddrX16:$src),
+          (SPLIT_QUADWORD (LQ memrix16:$src))>;
+
+def : Pat<(int_ppc_atomic_load_i128 ForceXForm:$src),
+          (SPLIT_QUADWORD (LQX_PSEUDO memrr:$src))>;
+
 // Support for medium and large code model.
 let hasSideEffects = 0 in {
 let isReMaterializable = 1 in {
@@ -1536,12 +1549,28 @@
 def STQ : DSForm_1<62, 2, (outs), (ins g8prc:$RSp, memrix:$dst),
                    "stq $RSp, $dst", IIC_LdStSTQ,
                    []>, isPPC64;
+
+def STQX_PSEUDO : PPCCustomInserterPseudo<(outs),
+                                          (ins g8prc:$RSp, memrr:$dst),
+                                          "#STQX_PSEUDO", []>;
+
 def SPILL_QUADWORD : PPCEmitTimePseudo<(outs), (ins g8prc:$RSp, memrix:$dst),
                                        "#SPILL_QUADWORD", []>;
 }
 }

+def BUILD_QUADWORD : PPCPostRAExpPseudo<
+                       (outs g8prc:$RTp),
+                       (ins g8rc:$lo, g8rc:$hi),
+                       "#BUILD_QUADWORD", []>;
+
+def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, DSForm:$dst),
+          (STQ (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrix:$dst)>;
+
+def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, ForceXForm:$dst),
+          (STQX_PSEUDO (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrr:$dst)>;
+
 // Stores with Update (pre-inc).
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -3104,6 +3104,7 @@
     return true;
   }
+  // FIXME: Maybe we can expand this in the 'PowerPC Expand Atomic' pass.
   case PPC::CFENCE8: {
     auto Val = MI.getOperand(0).getReg();
     BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
diff --git a/llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll b/llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
+; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-track-subreg-liveness \
+; RUN:   -ppc-quadword-atomics < %s | FileCheck --check-prefix=P8 %s
+
+define dso_local i128 @lq_unordered(i128* %src) {
+; P8-LABEL: lq_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    blr
+entry:
+  %0 = load atomic i128, i128* %src unordered, align 16
+  ret i128 %0
+}
+
+define dso_local i128 @lqx_unordered(i128* %src, i64 %idx) {
+; P8-LABEL: lqx_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    sldi r4, r4, 4
+; P8-NEXT:    add r3, r3, r4
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    blr
+entry:
+  %0 = getelementptr i128, i128* %src, i64 %idx
+  %1 = load atomic i128, i128* %0 unordered, align 16
+  ret i128 %1
+}
+
+define dso_local i128 @lq_big_offset_unordered(i128* %src) {
+; P8-LABEL: lq_big_offset_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lis r4, 32
+; P8-NEXT:    add r3, r3, r4
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    blr
+entry:
+  %0 = getelementptr i128, i128* %src, i64 131072
+  %1 = load atomic i128, i128* %0 unordered, align 16
+  ret i128 %1
+}
+
+define dso_local i128 @lq_monotonic(i128* %src) {
+; P8-LABEL: lq_monotonic:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    blr
+entry:
+  %0 = load atomic i128, i128* %src monotonic, align 16
+  ret i128 %0
+}
+
+define dso_local i128 @lq_acquire(i128* %src) {
+; P8-LABEL: lq_acquire:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    cmpd cr7, r5, r5
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    bne- cr7, .+4
+; P8-NEXT:    isync
+; P8-NEXT:    blr
+entry:
+  %0 = load atomic i128, i128* %src acquire, align 16
+  ret i128 %0
+}
+
+define dso_local i128 @lq_seqcst(i128* %src) {
+; P8-LABEL: lq_seqcst:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    sync
+; P8-NEXT:    lq r4, 0(r3)
+; P8-NEXT:    cmpd cr7, r5, r5
+; P8-NEXT:    mr r3, r4
+; P8-NEXT:    mr r4, r5
+; P8-NEXT:    bne- cr7, .+4
+; P8-NEXT:    isync
+; P8-NEXT:    blr
+entry:
+  %0 = load atomic i128, i128* %src seq_cst, align 16
+  ret i128 %0
+}
+
+define dso_local void @stq_unordered(i128 %val, i128* %dst) {
+; P8-LABEL: stq_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    mr r7, r4
+; P8-NEXT:    mr r6, r3
+; P8-NEXT:    stq r6, 0(r5)
+; P8-NEXT:    blr
+entry:
+  store atomic i128 %val, i128* %dst unordered, align 16
+  ret void
+}
+
+define dso_local void @stqx_unordered(i128 %val, i128* %dst, i64 %idx) {
+; P8-LABEL: stqx_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    sldi r6, r6, 4
+; P8-NEXT:    mr r9, r4
+; P8-NEXT:    mr r8, r3
+; P8-NEXT:    add r3, r5, r6
+; P8-NEXT:    stq r8, 0(r3)
+; P8-NEXT:    blr
+entry:
+  %0 = getelementptr i128, i128* %dst, i64 %idx
+  store atomic i128 %val, i128* %0 unordered, align 16
+  ret void
+}
+
+define dso_local void @stq_big_offset_unordered(i128 %val, i128* %dst) {
+; P8-LABEL: stq_big_offset_unordered:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lis r6, 32
+; P8-NEXT:    mr r9, r4
+; P8-NEXT:    mr r8, r3
+; P8-NEXT:    add r3, r5, r6
+; P8-NEXT:    stq r8, 0(r3)
+; P8-NEXT:    blr
+entry:
+  %0 = getelementptr i128, i128* %dst, i64 131072
+  store atomic i128 %val, i128* %0 unordered, align 16
+  ret void
+}
+
+define dso_local void @stq_monotonic(i128 %val, i128* %dst) {
+; P8-LABEL: stq_monotonic:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    mr r7, r4
+; P8-NEXT:    mr r6, r3
+; P8-NEXT:    stq r6, 0(r5)
+; P8-NEXT:    blr
+entry:
+  store atomic i128 %val, i128* %dst monotonic, align 16
+  ret void
+}
+
+define dso_local void @stq_release(i128 %val, i128* %dst) {
+; P8-LABEL: stq_release:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lwsync
+; P8-NEXT:    mr r7, r4
+; P8-NEXT:    mr r6, r3
+; P8-NEXT:    stq r6, 0(r5)
+; P8-NEXT:    blr
+entry:
+  store atomic i128 %val, i128* %dst release, align 16
+  ret void
+}
+
+define dso_local void @stq_seqcst(i128 %val, i128* %dst) {
+; P8-LABEL: stq_seqcst:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    sync
+; P8-NEXT:    mr r7, r4
+; P8-NEXT:    mr r6, r3
+; P8-NEXT:    stq r6, 0(r5)
+; P8-NEXT:    blr
+entry:
+  store atomic i128 %val, i128* %dst seq_cst, align 16
+  ret void
+}