Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -61,6 +61,9 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">; +def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", + "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">; + def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", "Enable out of line atomics to support LSE instructions">; @@ -459,7 +462,7 @@ "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, FeatureNV, FeatureMPAM, FeatureDIT, FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI, - FeatureFlagM, FeatureRCPC_IMMO]>; + FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>; def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -660,6 +660,9 @@ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; + bool isOpSuitableForLDPSTP(const Instruction *I) const; + bool shouldInsertFencesForAtomic(const Instruction *I) const override; + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; @@ -863,6 +866,7 @@ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const; SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
=================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -785,6 +785,13 @@ setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); + // Aligned 128-bit loads and stores are single-copy atomic according to the + // v8.4a spec. + if (Subtarget->hasLSE2()) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); + } + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the // custom lowering, as there are no un-paired non-temporal stores and // legalization will break up 256 bit inputs. @@ -4681,18 +4688,7 @@ return Result; } } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { - assert(StoreNode->getValue()->getValueType(0) == MVT::i128); - SDValue Lo = - DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), - DAG.getConstant(0, Dl, MVT::i64)); - SDValue Hi = - DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), - DAG.getConstant(1, Dl, MVT::i64)); - SDValue Result = DAG.getMemIntrinsicNode( - AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other), - {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, - StoreNode->getMemoryVT(), StoreNode->getMemOperand()); - return Result; + return LowerStore128(Op, DAG); } else if (MemVT == MVT::i64x8) { SDValue Value = StoreNode->getValue(); assert(Value->getValueType(0) == MVT::i64x8); @@ -4713,6 +4709,31 @@ return SDValue(); } +/// Lower atomic or volatile 128-bit stores to a single STP instruction. 
+SDValue AArch64TargetLowering::LowerStore128(SDValue Op, + SelectionDAG &DAG) const { + MemSDNode *StoreNode = cast<MemSDNode>(Op); + assert(StoreNode->getMemoryVT() == MVT::i128); + assert(StoreNode->isVolatile() || StoreNode->isAtomic()); + assert(!StoreNode->isAtomic() || + StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || + StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); + + SDValue Value = StoreNode->getOpcode() == ISD::STORE + ? StoreNode->getOperand(1) + : StoreNode->getOperand(2); + SDLoc DL(Op); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, + DAG.getConstant(0, DL, MVT::i64)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, + DAG.getConstant(1, DL, MVT::i64)); + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::STP, DL, DAG.getVTList(MVT::Other), + {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + StoreNode->getMemoryVT(), StoreNode->getMemOperand()); + return Result; +} + SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -4950,6 +4971,12 @@ /*OverrideNEON=*/true); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::ATOMIC_STORE: + if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) { + assert(Subtarget->hasLSE2()); + return LowerStore128(Op, DAG); + } + return SDValue(); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::MSTORE: @@ -17506,12 +17533,14 @@ case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); return; + case ISD::ATOMIC_LOAD: case ISD::LOAD: { assert(SDValue(N, 0).getValueType() == MVT::i128 && "unexpected load's value type"); - LoadSDNode *LoadNode = cast<LoadSDNode>(N); - if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) { - // Non-volatile loads are optimized later in AArch64's load/store + MemSDNode *LoadNode = cast<MemSDNode>(N); + if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) || + LoadNode->getMemoryVT() != MVT::i128) { + // Non-volatile, non-atomic loads 
are optimized later in AArch64's load/store // optimizer. return; } @@ -17602,12 +17631,37 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } +// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic +// provided the address is 16-byte aligned. +bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { + if (!Subtarget->hasLSE2()) + return false; + + if (auto LI = dyn_cast<LoadInst>(I)) + return LI->getType()->getPrimitiveSizeInBits() == 128 && + LI->getAlignment() >= 16; + + if (auto SI = dyn_cast<StoreInst>(I)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && + SI->getAlignment() >= 16; + + return false; +} + +bool AArch64TargetLowering::shouldInsertFencesForAtomic( + const Instruction *I) const { + return isOpSuitableForLDPSTP(I); +} + // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 128; + if (Size != 128) + return false; + + return !isOpSuitableForLDPSTP(SI); } // Loads and stores less than 128-bits are already atomic; ones above that @@ -17616,7 +17670,11 @@ TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128 ? 
AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + + if (Size != 128 || isOpSuitableForLDPSTP(LI)) + return AtomicExpansionKind::None; + + return AtomicExpansionKind::LLSC; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -99,6 +99,7 @@ bool HasDotProd = false; bool HasCRC = false; bool HasLSE = false; + bool HasLSE2 = false; bool HasRAS = false; bool HasRDM = false; bool HasPerfMon = false; @@ -375,6 +376,7 @@ bool hasDotProd() const { return HasDotProd; } bool hasCRC() const { return HasCRC; } bool hasLSE() const { return HasLSE; } + bool hasLSE2() const { return HasLSE2; } bool hasRAS() const { return HasRAS; } bool hasRDM() const { return HasRDM; } bool hasSM4() const { return HasSM4; } Index: llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -824,6 +824,8 @@ return isStore ? AArch64::STRSui : AArch64::LDRSui; case 64: return isStore ? AArch64::STRDui : AArch64::LDRDui; + case 128: + return isStore ? 
AArch64::STRQui : AArch64::LDRQui; } break; } Index: llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -16,6 +16,7 @@ #include "AArch64Subtarget.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" @@ -35,6 +36,7 @@ using namespace LegalizeActions; using namespace LegalizeMutations; using namespace LegalityPredicates; +using namespace MIPatternMatch; AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) : ST(&ST) { @@ -278,6 +280,10 @@ }; getActionDefinitionsBuilder(G_LOAD) + .customIf([=](const LegalityQuery &Query) { + return Query.Types[0] == s128 && + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; + }) .legalForTypesWithMemDesc({{s8, p0, s8, 8}, {s16, p0, s16, 8}, {s32, p0, s32, 8}, @@ -316,6 +322,10 @@ .scalarizeIf(typeIs(0, v2s16), 0); getActionDefinitionsBuilder(G_STORE) + .customIf([=](const LegalityQuery &Query) { + return Query.Types[0] == s128 && + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; + }) .legalForTypesWithMemDesc({{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16 {s32, p0, s8, 8}, // truncstorei8 from s32 @@ -1001,6 +1011,20 @@ return true; } +static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset, + MachineRegisterInfo &MRI) { + Base = Root; + Offset = 0; + + Register NewBase; + int64_t NewOffset; + if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) && + isShiftedInt<7, 3>(NewOffset)) { + Base = NewBase; + Offset = NewOffset; + } +} + // FIXME: This should be removed and replaced with the generic bitcast legalize // action. 
bool AArch64LegalizerInfo::legalizeLoadStore( @@ -1020,6 +1044,36 @@ Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); + if (ValTy == LLT::scalar(128)) { + assert((*MI.memoperands_begin())->getSuccessOrdering() == + AtomicOrdering::Monotonic || + (*MI.memoperands_begin())->getSuccessOrdering() == + AtomicOrdering::Unordered); + assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2"); + LLT s64 = LLT::scalar(64); + MachineInstrBuilder NewI; + if (MI.getOpcode() == TargetOpcode::G_LOAD) { + NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {}); + MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)}); + } else { + auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0)); + NewI = MIRBuilder.buildInstr( + AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)}); + } + Register Base; + int Offset; + matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI); + NewI.addUse(Base); + NewI.addImm(Offset / 8); + + NewI.cloneMemRefs(MI); + constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(), + *MRI.getTargetRegisterInfo(), + *ST->getRegBankInfo()); + MI.eraseFromParent(); + return true; + } + if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || ValTy.getElementType().getAddressSpace() != 0) { LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1 -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=apple-a13 -global-isel -global-isel-abort=1 | 
FileCheck %s --check-prefix=CHECK-CAS-O1 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0 -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 @var = global i128 0 define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { @@ -411,7 +411,7 @@ ; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload ; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11] -; CHECK-CAS-O0-NEXT: mov x8, #0 +; CHECK-CAS-O0-NEXT: mov x8, xzr ; CHECK-CAS-O0-NEXT: orr x9, x9, x8 ; CHECK-CAS-O0-NEXT: orr x10, x8, x10 ; CHECK-CAS-O0-NEXT: // implicit-def: $q0 Index: llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll @@ -0,0 +1,212 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s + +define void @test_atomic_load(i128* %addr) { +; CHECK-LABEL: test_atomic_load: + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %res.0 = load atomic i128, i128* %addr monotonic, align 16 + store i128 %res.0, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: 
mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %res.1 = load atomic i128, i128* %addr unordered, align 16 + store i128 %res.1, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %res.2 = load atomic i128, i128* %addr acquire, align 16 + store i128 %res.2, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %res.3 = load atomic i128, i128* %addr seq_cst, align 16 + store i128 %res.3, i128* %addr + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.5, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16 + store i128 %res.6, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 to i128* + %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16 + store i128 %res.7, i128* %addr + + ret void +} + +define void @test_libcall_load(i128* %addr) { +; CHECK-LABEL: test_libcall_load: +; CHECK: bl __atomic_load + %res.8 = load atomic i128, i128* %addr unordered, align 8 + store i128 %res.8, i128* 
%addr + + ret void +} + +define void @test_nonfolded_load1(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load1: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load2(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load3(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load3: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]] +; CHECK: mov v[[Q]].d[1], [[HI]] +; CHECK: str q[[Q]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_atomic_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_atomic_store: + +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr monotonic, align 16 + +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr unordered, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr 
release, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] +; CHECK: dmb ish + store atomic i128 %val, i128* %addr seq_cst, align 16 + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: stp x2, x3, [x0, #8] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #504] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + store atomic i128 %val, i128* %addr128.2 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #-512] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 to i128* + store atomic i128 %val, i128* %addr128.3 monotonic, align 16 + + ret void +} + +define void @test_libcall_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_libcall_store: +; CHECK: bl __atomic_store + store atomic i128 %val, i128* %addr unordered, align 8 + + ret void +} + +define void @test_nonfolded_store1(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store1: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store2(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store3(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store3: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr 
i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} Index: llvm/test/CodeGen/AArch64/atomic-ops-lse.ll =================================================================== --- llvm/test/CodeGen/AArch64/atomic-ops-lse.ll +++ llvm/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS ; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG -; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-lse2 < %s | FileCheck %s ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created ; (i.e. reusing a register for status & data in store exclusive). 
Index: llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll @@ -0,0 +1,194 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - | FileCheck %s + +define void @test_atomic_load(i128* %addr) { +; CHECK-LABEL: test_atomic_load: + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: stp [[LO]], [[HI]], [x0] + %res.0 = load atomic i128, i128* %addr monotonic, align 16 + store i128 %res.0, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: stp [[LO]], [[HI]], [x0] + %res.1 = load atomic i128, i128* %addr unordered, align 16 + store i128 %res.1, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: stp [[LO]], [[HI]], [x0] + %res.2 = load atomic i128, i128* %addr acquire, align 16 + store i128 %res.2, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: stp [[LO]], [[HI]], [x0] + %res.3 = load atomic i128, i128* %addr seq_cst, align 16 + store i128 %res.3, i128* %addr + + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #32] +; CHECK-DAG: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 32 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.5, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16 + store i128 %res.6, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 
to i128* + %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16 + store i128 %res.7, i128* %addr + + ret void +} + +define void @test_libcall_load(i128* %addr) { +; CHECK-LABEL: test_libcall_load: +; CHECK: bl __atomic_load + %res.8 = load atomic i128, i128* %addr unordered, align 8 + store i128 %res.8, i128* %addr + + ret void +} + +define void @test_nonfolded_load1(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load1: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load2(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load3(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load3: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_atomic_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_atomic_store: + +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr monotonic, align 16 + +; CHECK: stp x2, x3, [x0] + store atomic i128 
%val, i128* %addr unordered, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr release, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] +; CHECK: dmb ish + store atomic i128 %val, i128* %addr seq_cst, align 16 + + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: stp x2, x3, [x0, #8] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #504] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + store atomic i128 %val, i128* %addr128.2 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #-512] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 to i128* + store atomic i128 %val, i128* %addr128.3 monotonic, align 16 + + ret void +} + +define void @test_libcall_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_libcall_store: +; CHECK: bl __atomic_store + store atomic i128 %val, i128* %addr unordered, align 8 + + ret void +} + +define void @test_nonfolded_store1(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store1: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store2(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store3(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store3: + %addr8 = 
bitcast i128* %addr to i8* + +; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +}