diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -475,8 +475,10 @@
   STZ2G,
 
   LDP,
+  LDIAPP,
   LDNP,
   STP,
+  STILP,
   STNP,
 
   // Memory Operations
@@ -710,6 +712,7 @@
   void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
 
   bool isOpSuitableForLDPSTP(const Instruction *I) const;
+  bool isOpSuitableForRCPC3(const Instruction *I) const;
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
 
   TargetLoweringBase::AtomicExpansionKind
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -805,8 +805,8 @@
   setOperationAction(ISD::STORE, MVT::i128, Custom);
 
   // Aligned 128-bit loads and stores are single-copy atomic according to the
-  // v8.4a spec.
-  if (Subtarget->hasLSE2()) {
+  // v8.4a spec. FEAT_LRCPC3 introduces 128-bit STILP/LDIAPP.
+  if (Subtarget->hasLSE2() || Subtarget->hasRCPC3()) {
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
     setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
   }
@@ -2543,8 +2543,10 @@
     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
     MAKE_CASE(AArch64ISD::LDP)
+    MAKE_CASE(AArch64ISD::LDIAPP)
    MAKE_CASE(AArch64ISD::LDNP)
     MAKE_CASE(AArch64ISD::STP)
+    MAKE_CASE(AArch64ISD::STILP)
     MAKE_CASE(AArch64ISD::STNP)
     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
@@ -5763,9 +5765,15 @@
   MemSDNode *StoreNode = cast<MemSDNode>(Op);
   assert(StoreNode->getMemoryVT() == MVT::i128);
   assert(StoreNode->isVolatile() || StoreNode->isAtomic());
-  assert(!StoreNode->isAtomic() ||
-         StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
-         StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+  bool isStoreRelease =
+      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
+  if (StoreNode->isAtomic())
+    assert((Subtarget->hasFeature(AArch64::FeatureRCPC3) && isStoreRelease) ||
+           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
+           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+  // FIXME: RCPC3 not actually implemented, will lower to STP.
 
   SDValue Value = StoreNode->getOpcode() == ISD::STORE
                       ? StoreNode->getOperand(1)
@@ -5775,8 +5783,10 @@
                            DAG.getConstant(0, DL, MVT::i64));
   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
                            DAG.getConstant(1, DL, MVT::i64));
+
+  unsigned Opcode = isStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
   SDValue Result = DAG.getMemIntrinsicNode(
-      AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
+      Opcode, DL, DAG.getVTList(MVT::Other),
       {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
       StoreNode->getMemoryVT(), StoreNode->getMemOperand());
   return Result;
@@ -6049,7 +6059,7 @@
     return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ATOMIC_STORE:
     if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
-      assert(Subtarget->hasLSE2());
+      assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
       return LowerStore128(Op, DAG);
     }
     return SDValue();
@@ -22069,9 +22079,16 @@
   }
 
   if (SDValue(N, 0).getValueType() == MVT::i128) {
+    auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
+    bool isLoadAcquire =
+        AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
+    unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
+
+    if (isLoadAcquire)
+      assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
+
     SDValue Result = DAG.getMemIntrinsicNode(
-        AArch64ISD::LDP, SDLoc(N),
-        DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+        Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
         {LoadNode->getChain(), LoadNode->getBasePtr()},
         LoadNode->getMemoryVT(), LoadNode->getMemOperand());
 
@@ -22194,8 +22211,28 @@
   return false;
 }
 
+bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
+  if (!Subtarget->hasRCPC3())
+    return false;
+
+  if (auto LI = dyn_cast<LoadInst>(I))
+    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+           LI->getAlign() >= Align(16) &&
+           LI->getOrdering() == AtomicOrdering::Acquire;
+
+  if (auto SI = dyn_cast<StoreInst>(I))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+           SI->getAlign() >= Align(16) &&
+           SI->getOrdering() == AtomicOrdering::Release;
+
+  return false;
+}
+
 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
     const Instruction *I) const {
+  // Inserting fences changes the load/store ordering to monotonic.
+  if (isOpSuitableForRCPC3(I))
+    return false;
   return isOpSuitableForLDPSTP(I);
 }
@@ -22205,7 +22242,7 @@
 TargetLoweringBase::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  if (Size != 128 || isOpSuitableForLDPSTP(SI))
+  if (Size != 128 || isOpSuitableForLDPSTP(SI) || isOpSuitableForRCPC3(SI))
     return AtomicExpansionKind::None;
   return AtomicExpansionKind::Expand;
 }
@@ -22217,7 +22254,7 @@
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
 
-  if (Size != 128 || isOpSuitableForLDPSTP(LI))
+  if (Size != 128 || isOpSuitableForLDPSTP(LI) || isOpSuitableForRCPC3(LI))
     return AtomicExpansionKind::None;
 
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -362,8 +362,10 @@
 def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 
 // Generates the general dynamic sequences, i.e.
@@ -786,8 +788,10 @@
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
 
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64ldiapp : SDNode<"AArch64ISD::LDIAPP", SDT_AArch64ldiapp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -3408,7 +3412,7 @@
 def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
 def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
 
-// Pair (pre-indexed)
+// Pair (post-indexed)
 def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
 def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
 def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
@@ -8684,6 +8688,9 @@
   def LDIAPPW: BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0001, (outs GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">;
   def LDIAPPX: BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0001, (outs GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">;
 
+  def : Pat<(AArch64ldiapp GPR64sp:$Rn), (LDIAPPX GPR64sp:$Rn)>;
+  def : Pat<(AArch64stilp GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), (STILPX GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn)>;
+
   // Aliases for when offset=0
   def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPW GPR32: $Rt, GPR32: $Rt2, GPR64sp:$Rn)>;
   def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPX GPR64: $Rt, GPR64: $Rt2, GPR64sp:$Rn)>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -309,6 +309,10 @@
   };
 
   getActionDefinitionsBuilder(G_LOAD)
+      .customIf([&](const LegalityQuery &Query) {
+        return ST.hasRCPC3() && Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
+      })
       .customIf([=](const LegalityQuery &Query) {
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
@@ -347,6 +351,10 @@
       .scalarizeIf(typeIs(0, v2s16), 0);
 
   getActionDefinitionsBuilder(G_STORE)
+      .customIf([&](const LegalityQuery &Query) {
+        return ST.hasRCPC3() && Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
+      })
       .customIf([=](const LegalityQuery &Query) {
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
@@ -1188,26 +1196,47 @@
   const LLT ValTy = MRI.getType(ValReg);
 
   if (ValTy == LLT::scalar(128)) {
-    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
-               AtomicOrdering::Monotonic ||
-           (*MI.memoperands_begin())->getSuccessOrdering() ==
-               AtomicOrdering::Unordered);
-    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+
+    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
+    bool isLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
+    bool isLoadAcquire = isLoad && Ordering == AtomicOrdering::Acquire;
+    bool isStoreRelease = !isLoad && Ordering == AtomicOrdering::Release;
+    bool isRCPC3 = ST->hasRCPC3() && (isLoadAcquire || isStoreRelease);
+
     LLT s64 = LLT::scalar(64);
+
+    unsigned Opcode;
+    if (isRCPC3) {
+      Opcode = isLoad ? AArch64::LDIAPPX : AArch64::STILPX;
+    } else {
+      // For LSE2, loads/stores should have been converted to monotonic and had
+      // a fence inserted after them.
+      assert(Ordering == AtomicOrdering::Monotonic ||
+             Ordering == AtomicOrdering::Unordered);
+      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+
+      Opcode = isLoad ? AArch64::LDPXi : AArch64::STPXi;
+    }
+
     MachineInstrBuilder NewI;
-    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
-      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
+    if (isLoad) {
+      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
       MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
     } else {
       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
       NewI = MIRBuilder.buildInstr(
-          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
+          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
+    }
+
+    if (isRCPC3) {
+      NewI.addUse(MI.getOperand(1).getReg());
+    } else {
+      Register Base;
+      int Offset;
+      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
+      NewI.addUse(Base);
+      NewI.addImm(Offset / 8);
     }
-    Register Base;
-    int Offset;
-    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
-    NewI.addUse(Base);
-    NewI.addImm(Offset / 8);
 
     NewI.cloneMemRefs(MI);
     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
@@ -288,31 +288,15 @@
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire:
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
@@ -154,20 +154,8 @@
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stlxp w8, x0, x1, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    stilp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
@@ -288,31 +288,15 @@
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire:
+; CHECK:    ldiapp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
+; CHECK:    ldiapp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
@@ -150,18 +150,8 @@
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x12, [x9]
-; -O0:    cmp x10, x11
-; -O0:    cmp x12, x13
-; -O0:    stlxp w8, x14, x15, [x9]
-; -O0:    stlxp w8, x10, x12, [x9]
-; -O0:    subs x12, x12, x13
-; -O0:    ccmp x10, x11, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stlxp w8, x1, x0, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    stilp x1, x0, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
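
Reviewer note (not part of the patch): a minimal LLVM IR sketch of the i128 cases this change targets, mirroring the tests above. The llc invocation is an assumption here (the RUN lines of the rcpc3 test files are not shown in this diff); the +rcpc3 attribute name is taken from the FeatureRCPC3 subtarget feature referenced in the lowering code.

; Illustrative only. Hypothetical invocation:
;   llc -mtriple=aarch64 -mattr=+rcpc3 -O1 example.ll -o -
define i128 @acquire_load(ptr %p) {
  ; With FEAT_LRCPC3 this is expected to select LDIAPP instead of an LDAXP/STXP loop.
  %v = load atomic i128, ptr %p acquire, align 16
  ret i128 %v
}

define void @release_store(i128 %v, ptr %p) {
  ; With FEAT_LRCPC3 this is expected to select STILP instead of an LDXP/STLXP loop.
  store atomic i128 %v, ptr %p release, align 16
  ret void
}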