diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1665,6 +1665,9 @@
   /// ordered or volatile memory references.
   bool hasOrderedMemoryRef() const;
 
+  /// Return true if this instruction has a volatile memory reference.
+  bool hasVolatileMemoryRef() const;
+
   /// Return true if this load instruction never traps and points to a memory
   /// location whose value doesn't change during the execution of this function.
   ///
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1405,6 +1405,14 @@
   });
 }
 
+/// Return true if this instruction has a volatile memory reference.
+bool MachineInstr::hasVolatileMemoryRef() const {
+  // Check if any of our memory operands are volatile.
+  return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
+    return MMO->isVolatile();
+  });
+}
+
 /// isDereferenceableInvariantLoad - Return true if this instruction will never
 /// trap and is loading from a location whose value is invariant across a run of
 /// this function.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -107,9 +107,21 @@
   /// Returns the base register operator of a load/store.
   static const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
 
-  /// Returns the the immediate offset operator of a load/store.
+  /// Returns the immediate offset operator of a load/store.
   static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 
+  /// Returns the immediate offset of a load/store, or 0 if there is none.
+  static int64_t getLdStOffset(const MachineInstr &MI);
+
+  /// Return whether this load/store has an offset operand.
+  static bool hasLdStOffsetOp(const MachineInstr &MI);
+
+  /// Return whether a pre/post indexed variant of this instruction exists.
+  /// Used by isCandidateToMergeOrPair, which otherwise uses the
+  /// existence/position of immediate operands to decide whether an op is
+  /// suitable for merging/pairing.
+  static bool hasRCPC3PrePostIndexVariant(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2210,6 +2210,14 @@
   case AArch64::LDURBBi:
   case AArch64::LDURSBWi:
   case AArch64::LDURSHWi:
+  case AArch64::LDIAPPWpre:
+  case AArch64::LDIAPPXpre:
+  case AArch64::STILPWpre:
+  case AArch64::STILPXpre:
+  case AArch64::LDAPRWpre:
+  case AArch64::LDAPRXpre:
+  case AArch64::STLRWpre:
+  case AArch64::STLRXpre:
     return true;
   }
 }
@@ -2392,6 +2400,10 @@
   case AArch64::LDRXui:
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
+  case AArch64::LDAPRW:
+  case AArch64::LDAPRX:
+  case AArch64::STLRW:
+  case AArch64::STLRX:
   // Unscaled instructions.
   case AArch64::STURSi:
   case AArch64::STRSpre:
@@ -2515,9 +2527,20 @@
   bool IsPreLdSt = isPreLdSt(MI);
 
   // If this is a volatile load/store, don't mess with it.
-  if (MI.hasOrderedMemoryRef())
+  if (MI.hasVolatileMemoryRef() || MI.memoperands_empty())
     return false;
 
+  // The only ordered loads/stores that we consider here are those in RCPC3.
+ if (!hasRCPC3PrePostIndexVariant(MI)) { + for (const MachineMemOperand *MMO : MI.memoperands()) + if (!MMO->isUnordered()) + return false; + } + + // FIXME temporary to see what this hits besides the instructions we're adding + if (MI.hasOrderedMemoryRef()) + assert(hasRCPC3PrePostIndexVariant(MI)); + // Make sure this is a reg/fi+imm (as opposed to an address reloc). // For Pre-inc LD/ST, the operand is shifted by one. assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || @@ -2526,9 +2549,11 @@ // For Pre-indexed addressing quadword instructions, the third operand is the // immediate value. - bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); + bool IsImmPreLdSt = + IsPreLdSt && MI.getNumOperands() >= 4 && MI.getOperand(3).isImm(); + bool IsTypicalCase = MI.getNumOperands() >= 3 && MI.getOperand(2).isImm(); - if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) + if (!IsTypicalCase && !IsImmPreLdSt && !hasRCPC3PrePostIndexVariant(MI)) return false; // Can't merge/pair if the instruction modifies the base register. @@ -3098,6 +3123,14 @@ case AArch64::LDURSBWi: case AArch64::STRBBui: case AArch64::STURBBi: + case AArch64::LDIAPPW: + case AArch64::LDIAPPX: + case AArch64::STILPW: + case AArch64::STILPX: + case AArch64::LDAPRW: + case AArch64::LDAPRX: + case AArch64::STLRW: + case AArch64::STLRX: return 1; case AArch64::LDRHHui: case AArch64::LDURHHi: @@ -3194,12 +3227,16 @@ switch (MI.getOpcode()) { default: return false; + case AArch64::LDIAPPW: + case AArch64::LDIAPPX: case AArch64::LDPSi: case AArch64::LDPSWi: case AArch64::LDPDi: case AArch64::LDPQi: case AArch64::LDPWi: case AArch64::LDPXi: + case AArch64::STILPW: + case AArch64::STILPX: case AArch64::STPSi: case AArch64::STPDi: case AArch64::STPQi: @@ -3217,14 +3254,44 @@ return MI.getOperand(Idx); } +bool AArch64InstrInfo::hasRCPC3PrePostIndexVariant(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Added by FEAT_RCPC3 + case AArch64::LDIAPPW: + case AArch64::STILPW: + case AArch64::LDIAPPX: + case AArch64::STILPX: + // Pre-existing, but FEAT_LRCPC3 added pre/post indexed versions + case AArch64::LDAPRW: + case AArch64::LDAPRX: + case AArch64::STLRW: + case AArch64::STLRX: + return true; + } +} + +bool AArch64InstrInfo::hasLdStOffsetOp(const MachineInstr &MI) { + // Currently only the FEAT_LRCPC3 instructions don't have an index operand. + return !hasRCPC3PrePostIndexVariant(MI); +} + const MachineOperand & AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { + assert(hasLdStOffsetOp(MI)); unsigned Idx = AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
              3 : 2;
   return MI.getOperand(Idx);
 }
 
+int64_t AArch64InstrInfo::getLdStOffset(const MachineInstr &MI) {
+  if (!hasLdStOffsetOp(MI))
+    return 0;
+  return AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -21,6 +21,7 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -475,6 +476,14 @@
     return AArch64::STZ2GPreIndex;
   case AArch64::STGPi:
     return AArch64::STGPpre;
+  case AArch64::STILPW:
+    return AArch64::STILPWpre;
+  case AArch64::STILPX:
+    return AArch64::STILPXpre;
+  case AArch64::STLRW:
+    return AArch64::STLRWpre;
+  case AArch64::STLRX:
+    return AArch64::STLRXpre;
   }
 }
 
@@ -534,6 +543,18 @@
     return AArch64::LDPWpost;
   case AArch64::LDPXi:
     return AArch64::LDPXpost;
+  case AArch64::LDIAPPW:
+    return AArch64::LDIAPPWpre;
+  case AArch64::LDIAPPX:
+    return AArch64::LDIAPPXpre;
+  case AArch64::LDAPRW:
+    return AArch64::LDAPRWpre;
+  case AArch64::LDAPRX:
+    return AArch64::LDAPRXpre;
+  case AArch64::STLRW:
+    return AArch64::STLRWpre;
+  case AArch64::STLRX:
+    return AArch64::STLRXpre;
   case AArch64::STPSi:
     return AArch64::STPSpost;
   case AArch64::STPDi:
@@ -588,9 +609,53 @@
   }
 }
 
+static bool getRCPC3MemOpInfo(const MachineInstr &MI, int &Scale,
+                              int &MinOffset, int &MaxOffset) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDIAPPW:
+    Scale = 1;
+    MinOffset = MaxOffset = 8;
+    return true;
+  case AArch64::STILPW:
+    Scale = 1;
+    MinOffset = MaxOffset = -8;
+    return true;
+  case AArch64::LDIAPPX:
+    Scale = 1;
+    MinOffset = MaxOffset = 16;
+    return true;
+  case AArch64::STILPX:
+    Scale = 1;
+    MinOffset = MaxOffset = -16;
+    return true;
+  case AArch64::LDAPRW:
+    Scale = 1;
+    MinOffset = MaxOffset = 4;
+    return true;
+  case AArch64::STLRW:
+    Scale = 1;
+    MinOffset = MaxOffset = -4;
+    return true;
+  case AArch64::LDAPRX:
+    Scale = 1;
+    MinOffset = MaxOffset = 8;
+    return true;
+  case AArch64::STLRX:
+    Scale = 1;
+    MinOffset = MaxOffset = -8;
+    return true;
+  }
+}
+
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
+  // Special case: the RCPC3 writeback forms accept only one specific offset.
+  if (getRCPC3MemOpInfo(MI, Scale, MinOffset, MaxOffset))
+    return;
+
   bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
@@ -663,6 +728,8 @@
 
 static bool isMergeableLdStUpdate(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
+  if (AArch64InstrInfo::hasRCPC3PrePostIndexVariant(MI))
+    return true;
   switch (Opc) {
   default:
     return false;
@@ -712,7 +779,6 @@
     // Make sure this is a reg+imm (as opposed to an address reloc).
     if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
       return false;
-
     return true;
   }
 }
@@ -1287,11 +1353,13 @@
                                     LdStPairFlags &Flags,
                                     const AArch64InstrInfo *TII) {
   // If this is volatile or if pairing is suppressed, not a candidate.
- if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + if (MI.hasVolatileMemoryRef() || MI.memoperands_empty()) + return false; + if (TII->isLdStPairSuppressed(MI)) return false; // We should have already checked FirstMI for pair suppression and volatility. - assert(!FirstMI.hasOrderedMemoryRef() && + assert(!FirstMI.hasVolatileMemoryRef() && !TII->isLdStPairSuppressed(FirstMI) && "FirstMI shouldn't get here if either of these checks are true."); @@ -1306,6 +1374,12 @@ if (OpcA == OpcB) return !AArch64InstrInfo::isPreLdSt(FirstMI); + // For RCPC3, we only merge pairs with matching opcodes. + if (AArch64InstrInfo::hasRCPC3PrePostIndexVariant(FirstMI)) { + assert(OpcA != OpcB); + return false; + } + // Try to match a sign-extended load/store with a zero-extended load/store. bool IsValidLdStrOpc, PairIsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); @@ -1528,7 +1602,7 @@ bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); Register Reg = getLdStRegOp(FirstMI).getReg(); Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); - int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); + int Offset = AArch64InstrInfo::getLdStOffset(FirstMI); int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1563,6 +1637,7 @@ Flags.setSExtIdx(-1); if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + AArch64InstrInfo::hasLdStOffsetOp(MI) && AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) { assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see @@ -1572,7 +1647,7 @@ // actually an immediate and not a symbolic reference destined for // a relocation. Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg(); - int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); + int MIOffset = AArch64InstrInfo::getLdStOffset(MI); bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI); if (IsUnscaled != MIIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. @@ -1819,9 +1894,10 @@ .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) .add(AArch64InstrInfo::getLdStBaseOp(*I)) - .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); + if (AArch64InstrInfo::hasLdStOffsetOp(*I)) + MIB.addImm(Value / Scale); } else { // Paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) @@ -1829,9 +1905,10 @@ .add(getLdStRegOp(*I, 0)) .add(getLdStRegOp(*I, 1)) .add(AArch64InstrInfo::getLdStBaseOp(*I)) - .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); + if (AArch64InstrInfo::hasLdStOffsetOp(*I)) + MIB.addImm(Value / Scale); } if (CFI != E) { MachineBasicBlock *MBB = I->getParent(); @@ -1914,8 +1991,9 @@ MachineBasicBlock::iterator MBBI = I; Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); - int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * - TII->getMemScale(MemMI); + + int MIUnscaledOffset = + AArch64InstrInfo::getLdStOffset(MemMI) * TII->getMemScale(MemMI); // Scan forward looking for post-index opportunities. 
   // Updating instructions can't be formed if the memory instruction doesn't have the offset we're
@@ -1992,7 +2070,7 @@
   MachineFunction &MF = *MemMI.getMF();
 
   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
-  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffset(MemMI);
 
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
@@ -2128,7 +2206,7 @@
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
-  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffset(MI);
   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
@@ -2196,7 +2274,7 @@
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
   int UnscaledOffset =
-      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
+      AArch64InstrInfo::getLdStOffset(MI) * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll b/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll
@@ -0,0 +1,396 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\s*(lda|ldia|stl|stil)"
+
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-LSE2
+; RUN: llc %s -o - -mtriple=aarch64_be-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64_be-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-LSE2
+
+; TODO:
+; SelectionDAG implementation
+; merge two ordered load-acquires into a load-acquire pair
+; merging of pre/post indexed variants
+; LDIAPP only generated if we have LSE
+; LSE2 makes pair operations single-copy atomic for naturally aligned accesses
+
+; Note:
+; Loading the pointer indirectly results in an update operation (add/sub)
+; which reuses the same src/dst register (x8).
+; Doing getelementptr on the input ptr directly results in `x8 = add blah`.
+
+; RCPC3 lets us merge two 32/64-bit atomic loads into a single one.
+define dso_local void @load_store_2xi32_rcpc3(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_store_2xi32_rcpc3: +; GISEL-NO-LSE2: ldapr w10, [x8] +; GISEL-NO-LSE2: ldapr w11, [x9] +; GISEL-NO-LSE2: stlr w10, [x8] +; GISEL-NO-LSE2: stlr w11, [x9] +; +; SDAG-NO-LSE2-LABEL: load_store_2xi32_rcpc3: +; SDAG-NO-LSE2: ldapr w10, [x8] +; SDAG-NO-LSE2: ldapr w11, [x9] +; SDAG-NO-LSE2: stlr w10, [x8] +; SDAG-NO-LSE2: stlr w11, [x9] +; +; GISEL-LSE2-LABEL: load_store_2xi32_rcpc3: +; GISEL-LSE2: ldapr w10, [x8] +; GISEL-LSE2: ldapr w11, [x9] +; GISEL-LSE2: stlr w10, [x8] +; GISEL-LSE2: stlr w11, [x9] +; +; SDAG-LSE2-LABEL: load_store_2xi32_rcpc3: +; SDAG-LSE2: ldapr w10, [x8] +; SDAG-LSE2: ldapr w11, [x9] +; SDAG-LSE2: stlr w10, [x8] +; SDAG-LSE2: stlr w11, [x9] + %ptr1 = load ptr, ptr %ptr2ptr + %ptr2 = getelementptr ptr, ptr %ptr1, i32 1 + + %a1 = load atomic i32, ptr %ptr1 acquire, align 8 + %a2 = load atomic i32, ptr %ptr2 acquire, align 8 + + %b1 = add i32 %a1, %a1 + %b2 = add i32 %a2, %a2 + + store atomic i32 %b1, ptr %ptr1 release, align 8 + store atomic i32 %b2, ptr %ptr2 release, align 8 + + ret void +} + +define dso_local void @load_store_2xi64_rcpc3(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_store_2xi64_rcpc3: +; GISEL-NO-LSE2: ldapr x10, [x8] +; GISEL-NO-LSE2: ldapr x11, [x9] +; GISEL-NO-LSE2: stlr x10, [x8] +; GISEL-NO-LSE2: stlr x11, [x9] +; +; SDAG-NO-LSE2-LABEL: load_store_2xi64_rcpc3: +; SDAG-NO-LSE2: ldapr x10, [x8] +; SDAG-NO-LSE2: ldapr x11, [x9] +; SDAG-NO-LSE2: stlr x10, [x8] +; SDAG-NO-LSE2: stlr x11, [x9] +; +; GISEL-LSE2-LABEL: load_store_2xi64_rcpc3: +; GISEL-LSE2: ldapr x10, [x8] +; GISEL-LSE2: ldapr x11, [x9] +; GISEL-LSE2: stlr x10, [x8] +; GISEL-LSE2: stlr x11, [x9] +; +; SDAG-LSE2-LABEL: load_store_2xi64_rcpc3: +; SDAG-LSE2: ldapr x10, [x8] +; SDAG-LSE2: ldapr x11, [x9] +; SDAG-LSE2: stlr x10, [x8] +; SDAG-LSE2: stlr x11, [x9] + %ptr1 = load ptr, ptr %ptr2ptr + %ptr2 = getelementptr ptr, ptr %ptr1, i64 1 + + %a1 = load atomic i64, ptr %ptr1 acquire, align 8 + %a2 = load atomic i64, ptr %ptr2 acquire, align 8 + + %b1 = add i64 %a1, %a1 + %b2 = add i64 %a2, %a2 + + store atomic i64 %b1, ptr %ptr1 release, align 8 + store atomic i64 %b2, ptr %ptr2 release, align 8 + + ret void +} + +; TODO Same again but with offsets + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Register Pair Ordered - Load Acquire RCpc / Store Release ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; RCPC3 + LSE2 gives us 128-bit single-copy atomics. 
+define dso_local void @load_atomic_i128_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i128_no_offset: +; GISEL-NO-LSE2: ldiapp xzr, x8, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i128_no_offset: +; SDAG-NO-LSE2: ldiapp xzr, x8, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i128_no_offset: +; GISEL-LSE2: ldiapp xzr, x8, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i128_no_offset: +; SDAG-LSE2: ldiapp xzr, x8, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i128, ptr %ptr acquire, align 16 + ret void +} + +define dso_local void @store_atomic_i128_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i128_no_offset: +; GISEL-NO-LSE2: stilp x8, xzr, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i128_no_offset: +; SDAG-NO-LSE2: stilp x8, xzr, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i128_no_offset: +; GISEL-LSE2: stilp x8, xzr, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i128_no_offset: +; SDAG-LSE2: stilp x8, xzr, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i128 1, ptr %ptr release, align 16 + ret void +} + +; Same again with pre/post indexing. +define dso_local void @load_atomic_i128_offset_16(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i128_offset_16: +; GISEL-NO-LSE2: ldiapp x8, x9, [x0], #16 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i128_offset_16: +; SDAG-NO-LSE2: ldiapp x8, x9, [x0], #16 +; +; GISEL-LSE2-LABEL: load_atomic_i128_offset_16: +; GISEL-LSE2: ldiapp x8, x9, [x0], #16 +; +; SDAG-LSE2-LABEL: load_atomic_i128_offset_16: +; SDAG-LSE2: ldiapp x8, x9, [x0], #16 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i128, ptr %ptr acquire, align 16 + %offset_ptr = getelementptr i64, ptr %ptr, i64 2 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i128 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define dso_local void @store_atomic_i128_offset_16(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i128_offset_16: +; GISEL-NO-LSE2: stilp x8, xzr, [x9, #-16]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i128_offset_16: +; SDAG-NO-LSE2: stilp x8, xzr, [x9, #-16]! +; +; GISEL-LSE2-LABEL: store_atomic_i128_offset_16: +; GISEL-LSE2: stilp x8, xzr, [x9, #-16]! +; +; SDAG-LSE2-LABEL: store_atomic_i128_offset_16: +; SDAG-LSE2: stilp x8, xzr, [x9, #-16]! 
+ %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i64, ptr %ptr, i64 -2 + store atomic i128 1, ptr %ptr_wb release, align 16 + ret void +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Single register - Load Acquire RCpc ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; LDAPR ,[],#8 Load Acquire RCpc Register, post index(8) with writeback +; LDAPR ,[],#4 Load Acquire RCpc Register, post index(4) with writeback + +define dso_local i64 @load_atomic_i64_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i64_no_offset: +; GISEL-NO-LSE2: ldapr x0, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i64_no_offset: +; SDAG-NO-LSE2: ldapr x0, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i64_no_offset: +; GISEL-LSE2: ldapr x0, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i64_no_offset: +; SDAG-LSE2: ldapr x0, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i64, ptr %ptr acquire, align 8 + ret i64 %a +} + +define dso_local i32 @load_atomic_i32_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i32_no_offset: +; GISEL-NO-LSE2: ldapr w0, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i32_no_offset: +; SDAG-NO-LSE2: ldapr w0, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i32_no_offset: +; GISEL-LSE2: ldapr w0, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i32_no_offset: +; SDAG-LSE2: ldapr w0, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i32, ptr %ptr acquire, align 4 + ret i32 %a +} + +define dso_local void @load_atomic_i64_offset_8(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i64_offset_8: +; GISEL-NO-LSE2: ldapr x8, [x0], #8 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i64_offset_8: +; SDAG-NO-LSE2: ldapr x8, [x0], #8 +; +; GISEL-LSE2-LABEL: load_atomic_i64_offset_8: +; GISEL-LSE2: ldapr x8, [x0], #8 +; +; SDAG-LSE2-LABEL: load_atomic_i64_offset_8: +; SDAG-LSE2: ldapr x8, [x0], #8 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i64, ptr %ptr acquire, align 8 + %offset_ptr = getelementptr i64, ptr %ptr, i64 1 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i64 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define dso_local void @load_atomic_i32_offset_4(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i32_offset_4: +; GISEL-NO-LSE2: ldapr w8, [x0], #4 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i32_offset_4: +; SDAG-NO-LSE2: ldapr w8, [x0], #4 +; +; GISEL-LSE2-LABEL: load_atomic_i32_offset_4: +; GISEL-LSE2: ldapr w8, [x0], #4 +; +; SDAG-LSE2-LABEL: load_atomic_i32_offset_4: +; SDAG-LSE2: ldapr w8, [x0], #4 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i32, ptr %ptr acquire, align 4 + %offset_ptr = getelementptr i32, ptr %ptr, i64 1 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i32 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Single register – Store Release ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; STLR ,[, #-8]! Store Release Register, negative pre index(8) with writeback +; STLR ,[, #-4]! 
Store Release Register, negative pre index(4) with writeback + +define dso_local void @store_atomic_i32_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i32_no_offset: +; GISEL-NO-LSE2: stlr w8, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i32_no_offset: +; SDAG-NO-LSE2: stlr w8, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i32_no_offset: +; GISEL-LSE2: stlr w8, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i32_no_offset: +; SDAG-LSE2: stlr w8, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i32 1, ptr %ptr release, align 4 + ret void +} + +define dso_local void @store_atomic_i64_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i64_no_offset: +; GISEL-NO-LSE2: stlr x8, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i64_no_offset: +; SDAG-NO-LSE2: stlr x8, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i64_no_offset: +; GISEL-LSE2: stlr x8, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i64_no_offset: +; SDAG-LSE2: stlr x8, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i64 1, ptr %ptr release, align 8 + ret void +} + + +define dso_local void @store_atomic_i32_offset_4(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i32_offset_4: +; GISEL-NO-LSE2: stlr w8, [x9, #-4]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i32_offset_4: +; SDAG-NO-LSE2: stlr w8, [x9, #-4]! +; +; GISEL-LSE2-LABEL: store_atomic_i32_offset_4: +; GISEL-LSE2: stlr w8, [x9, #-4]! +; +; SDAG-LSE2-LABEL: store_atomic_i32_offset_4: +; SDAG-LSE2: stlr w8, [x9, #-4]! + %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i32, ptr %ptr, i64 -1 + store atomic i32 1, ptr %ptr_wb release, align 4 + ret void +} + +define dso_local void @store_atomic_i64_offset_8(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i64_offset_8: +; GISEL-NO-LSE2: stlr x8, [x9, #-8]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i64_offset_8: +; SDAG-NO-LSE2: stlr x8, [x9, #-8]! +; +; GISEL-LSE2-LABEL: store_atomic_i64_offset_8: +; GISEL-LSE2: stlr x8, [x9, #-8]! +; +; SDAG-LSE2-LABEL: store_atomic_i64_offset_8: +; SDAG-LSE2: stlr x8, [x9, #-8]! + %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i64, ptr %ptr, i64 -1 + store atomic i64 1, ptr %ptr_wb release, align 8 + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; 3.1.2 Additions to the Advanced SIMD and floating-point ISA - Register variant ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; LDAPUR , [{, #}] // Zeroing +; LDAPUR , [{, #}] // Zeroing +; LDAPUR , [{, #}] // Zeroing +; LDAPUR
<Dt>, [<Xn|SP>{, #<simm>}] // Zeroing
+; LDAPUR <Qt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Bt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Ht>, [<Xn|SP>{, #<simm>}]
+; STLUR <St>, [<Xn|SP>{, #<simm>}]
+; STLUR <Dt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Qt>, [<Xn|SP>{, #<simm>}]
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; 3.1.2 Additions to the Advanced SIMD and floating-point ISA - Register Index variant ;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; LDAP1 { <Vt>.D }[<index>], [<Xn|SP>] // Merging
+; STL1 { <Vt>.D }[<index>], [<Xn|SP>]
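
For reference, a minimal hand-written sketch of the base-update rewrites the AArch64LoadStoreOptimizer changes above aim to form (register choices are illustrative, not autogenerated output; the fixed writeback offsets mirror getRCPC3MemOpInfo):

    ldapr x8, [x0]              ; acquire load
    add   x0, x0, #8            ; base update
  =>
    ldapr x8, [x0], #8          ; post-indexed form; the X-register variant only accepts #8

    sub   x1, x1, #16           ; base update
    stilp x2, x3, [x1]          ; release store pair
  =>
    stilp x2, x3, [x1, #-16]!   ; pre-indexed form; the X-register pair only accepts #-16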