diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -360,6 +360,12 @@ "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; +// In v8.4a onwards, aligned ldp and stp operations are atomic. This also applies +// to some specific earlier CPUs. +def FeatureAtomicLDPSTP : SubtargetFeature<"atomic-ldp-stp", "HasAtomicLDPSTP", "true", + "Use LDP & STP for aligned 128-bit atomics">; + + //===----------------------------------------------------------------------===// // Architectures. // @@ -380,7 +386,7 @@ "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT, FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI, - FeatureFMI, FeatureRCPC_IMMO]>; + FeatureFMI, FeatureRCPC_IMMO, FeatureAtomicLDPSTP]>; def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", @@ -573,7 +579,8 @@ FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing, - FeatureZCZeroingFPWorkaround + FeatureZCZeroingFPWorkaround, + FeatureAtomicLDPSTP ]>; def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -613,6 +613,12 @@ MI.eraseFromParent(); return true; } + case AArch64::LOAD_ATOMIC_128: + MI.setDesc(TII->get(AArch64::LDPXi)); + break; + case AArch64::STORE_ATOMIC_128: + MI.setDesc(TII->get(AArch64::STPXi)); + break; case AArch64::CMP_SWAP_8: return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB, AArch64::SUBSWrx, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,8 @@ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + bool isOpSuitableForLDPSTP(const Instruction *I) const; + bool shouldInsertFencesForAtomic(const Instruction *I) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -602,9 +602,20 @@ setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); + + // Volatile 128-bit loads and stores can have useful extra properties if + // implemented as a single LDP or STP. Unfortunately i128 is not a legal type + // so the only opportunity we have to do anything with them is the very first + // DAG combine. setTargetDAGCombine(ISD::STORE); - if (Subtarget->supportsAddressTopByteIgnored()) - setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::LOAD); + + // Similarly atomic 128-bit operations are sometimes implementable with LDP or + // STP. 
+  if (Subtarget->hasAtomicLDPSTP()) {
+    setTargetDAGCombine(ISD::ATOMIC_LOAD);
+    setTargetDAGCombine(ISD::ATOMIC_STORE);
+  }
 
   setTargetDAGCombine(ISD::MUL);
 
@@ -11491,6 +11502,85 @@
                      DAG.getConstant(MinOffset, DL, MVT::i64));
 }
 
+static void matchLDPSTPAddrMode(SDValue Addr, SDValue &Base, SDValue &Offset,
+                                SelectionDAG &DAG) {
+  SDLoc DL(Addr);
+  Base = Addr;
+  Offset = DAG.getTargetConstant(0, DL, MVT::i32);
+
+  if (Addr.getOpcode() != ISD::ADD || !isa<ConstantSDNode>(Addr->getOperand(1)))
+    return;
+
+  int64_t Val = cast<ConstantSDNode>(Base->getOperand(1))->getSExtValue();
+  if (Val % 8 != 0 || !isInt<7>(Val / 8))
+    return;
+
+  Base = Base->getOperand(0);
+  Offset = DAG.getTargetConstant(Val / 8, DL, MVT::i32);
+}
+
+static SDValue performAtomic128Load(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const AArch64Subtarget *Subtarget) {
+  MemSDNode *MN = cast<MemSDNode>(N);
+  assert((N->getOpcode() != ISD::ATOMIC_LOAD || MN->getAlignment() >= 16) &&
+         "ldp only atomic for aligned addresses");
+  assert(MN->getMemoryVT() == MVT::i128 && "Wrong size for load");
+  assert((N->getOpcode() != ISD::LOAD ||
+          cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD) &&
+         "unexpected extend");
+
+  if (Subtarget->requiresStrictAlign() && MN->getAlignment() < 16)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Base, Offset;
+  matchLDPSTPAddrMode(MN->getBasePtr(), Base, Offset, DAG);
+
+  SDNode *NewLoad = DAG.getMachineNode(AArch64::LOAD_ATOMIC_128, DL, MVT::i64,
+                                       MVT::i64, MVT::Other, Base, Offset,
+                                       N->getOperand(0));
+  DAG.setNodeMemRefs(cast<MachineSDNode>(NewLoad), MN->getMemOperand());
+
+  SDValue NewVal = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
+                               SDValue(NewLoad, 0), SDValue(NewLoad, 1));
+
+  DCI.CombineTo(N, {NewVal, SDValue(NewLoad, 2)});
+  return SDValue(N, 0);
+}
+
+static SDValue performAtomic128Store(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const AArch64Subtarget *Subtarget) {
+  MemSDNode *MN = cast<MemSDNode>(N);
+  assert((N->getOpcode() != ISD::ATOMIC_STORE || MN->getAlignment() >= 16) &&
+         "stp only atomic for aligned addresses");
+  assert(MN->getMemoryVT() == MVT::i128 && "Wrong size for store");
+  assert((N->getOpcode() != ISD::STORE ||
+          !cast<StoreSDNode>(N)->isTruncatingStore()) &&
+         "unexpected truncate");
+
+  if (Subtarget->requiresStrictAlign() && MN->getAlignment() < 16)
+    return SDValue();
+
+  SDValue Val = N->getOperand(N->getOpcode() == ISD::STORE ? 1 : 2);
+  SDLoc DL(N);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Val,
+                           DAG.getIntPtrConstant(0, DL));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Val,
+                           DAG.getIntPtrConstant(1, DL));
+
+  SDValue Base, Offset;
+  matchLDPSTPAddrMode(MN->getBasePtr(), Base, Offset, DAG);
+
+  SDValue Ops[] = {Lo, Hi, Base, Offset, N->getOperand(0)};
+  SDNode *NewStore =
+      DAG.getMachineNode(AArch64::STORE_ATOMIC_128, DL, MVT::Other, Ops);
+  DAG.setNodeMemRefs(cast<MachineSDNode>(NewStore), MN->getMemOperand());
+
+  return SDValue(NewStore, 0);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -11533,12 +11623,30 @@
     return performSelectCombine(N, DCI);
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
-  case ISD::LOAD:
-    if (performTBISimplification(N->getOperand(1), DCI, DAG))
+  case ISD::LOAD: {
+    MemSDNode *MN = cast<MemSDNode>(N);
+    if (MN->isVolatile() && MN->getMemoryVT() == MVT::i128)
+      return performAtomic128Load(N, DAG, DCI, Subtarget);
+    if (Subtarget->supportsAddressTopByteIgnored() &&
+        performTBISimplification(N->getOperand(1), DCI, DAG))
       return SDValue(N, 0);
     break;
-  case ISD::STORE:
+  }
+  case ISD::ATOMIC_LOAD:
+    if (Subtarget->hasAtomicLDPSTP() && N->getValueType(0) == MVT::i128)
+      return performAtomic128Load(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::ATOMIC_STORE:
+    if (Subtarget->hasAtomicLDPSTP() &&
+        N->getOperand(2)->getValueType(0) == MVT::i128)
+      return performAtomic128Store(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::STORE: {
+    MemSDNode *MN = cast<MemSDNode>(N);
+    if (MN->isVolatile() && MN->getMemoryVT() == MVT::i128)
+      return performAtomic128Store(N, DAG, DCI, Subtarget);
    return performSTORECombine(N, DCI, DAG, Subtarget);
+  }
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
@@ -11898,12 +12006,39 @@
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
+// provided the address is 16-byte aligned.
+bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
+  if (!Subtarget->hasAtomicLDPSTP())
+    return false;
+
+  if (auto LI = dyn_cast<LoadInst>(I))
+    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+           LI->getAlignment() >= 16;
+  else if (auto SI = dyn_cast<StoreInst>(I))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+           SI->getAlignment() >= 16;
+
+  return false;
+}
+
+bool AArch64TargetLowering::shouldInsertFencesForAtomic(
+    const Instruction *I) const {
+  // There is no LDP or STP with acquire/release semantics so we'll need a
+  // barrier.
+  return isOpSuitableForLDPSTP(I);
+}
+
+
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.
 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  return Size == 128;
+  if (Size != 128)
+    return false;
+
+  return !isOpSuitableForLDPSTP(SI);
 }
 
 // Loads and stores less than 128-bits are already atomic; ones above that
@@ -11912,7 +12047,11 @@
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return Size == 128 ?
AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + + if (Size != 128 || isOpSuitableForLDPSTP(LI)) + return AtomicExpansionKind::None; + + return AtomicExpansionKind::LLSC; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -454,3 +454,13 @@ defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">; defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">; } + +let mayLoad = 1 in +def LOAD_ATOMIC_128 : Pseudo<(outs GPR64:$Rt1, GPR64:$Rt2), + (ins GPR64:$Rn, simm7s8:$offs), []>, + Sched<[]>; + +let mayStore = 1 in +def STORE_ATOMIC_128 : Pseudo<(outs), (ins GPR64:$Rt1, GPR64:$Rt2, GPR64:$Rn, + simm7s8:$offs), []>, + Sched<[]>; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -157,6 +157,7 @@ bool HasZeroCycleZeroingGP = false; bool HasZeroCycleZeroingFP = false; bool HasZeroCycleZeroingFPWorkaround = false; + bool HasAtomicLDPSTP = false; // StrictAlign - Disallow unaligned memory accesses. bool StrictAlign = false; @@ -436,6 +437,7 @@ bool hasTLB_RMI() const { return HasTLB_RMI; } bool hasFMI() const { return HasFMI; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool hasAtomicLDPSTP() const { return HasAtomicLDPSTP; } bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll --- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-ATOMIC-LDP @var = global i128 0 @@ -169,6 +170,10 @@ ; CHECK-NOT: dmb ; CHECK-LABEL: ldaxp ; CHECK-NOT: dmb + +; CHECK-ATOMIC-LDP-LABEL: atomic_load_seq_cst: +; CHECK-ATOMIC-LDP: ldp x0, x1, [x0] +; CHECK-ATOMIC-LDP: dmb ish %r = load atomic i128, i128* %p seq_cst, align 16 ret i128 %r } @@ -181,6 +186,10 @@ ; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] ; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb + +; CHECK-ATOMIC-LDP-LABEL: atomic_load_relaxed: +; CHECK-ATOMIC-LDP: ldp x0, x1, [x2] +; CHECK-ATOMIC-LDP-NOT: dmb %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r } @@ -194,6 +203,12 @@ ; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] ; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb + +; CHECK-ATOMIC-LDP-LABEL: atomic_store_seq_cst: +; CHECK-ATOMIC-LDP: dmb ish +; CHECK-ATOMIC-LDP: stp x0, x1, [x2] +; CHECK-ATOMIC-LDP: dmb ish + store atomic i128 %in, i128* %p seq_cst, align 16 ret void } @@ -206,6 +221,12 @@ ; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] ; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb + +; CHECK-ATOMIC-LDP-LABEL: atomic_store_release: +; CHECK-ATOMIC-LDP: dmb ish +; CHECK-ATOMIC-LDP: stp x0, x1, [x2] +; CHECK-ATOMIC-LDP-NOT: dmb + store atomic i128 %in, i128* %p release, align 16 ret void } @@ -218,6 +239,12 @@ ; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] ; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; 
CHECK-NOT: dmb + +; CHECK-ATOMIC-LDP-LABEL: atomic_store_relaxed: +; CHECK-ATOMIC-LDP-NOT: dmb +; CHECK-ATOMIC-LDP: stp x0, x1, [x2] +; CHECK-ATOMIC-LDP-NOT: dmb + store atomic i128 %in, i128* %p unordered, align 16 ret void } diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll --- a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -87,10 +87,8 @@ define {i128, i1} @test_cmpxchg_128_unsplit(i128* %addr) { ; CHECK-LABEL: test_cmpxchg_128_unsplit: ; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128 -; CHECK: ldr [[DESIRED_HI:x[0-9]+]], [x[[VAR128]], #8] -; CHECK: ldr [[DESIRED_LO:x[0-9]+]], [x[[VAR128]]] -; CHECK: ldr [[NEW_HI:x[0-9]+]], [x[[VAR128]], #8] -; CHECK: ldr [[NEW_LO:x[0-9]+]], [x[[VAR128]]] +; CHECK: ldp [[DESIRED_LO:x[0-9]+]], [[DESIRED_HI:x[0-9]+]], [x[[VAR128]]] +; CHECK: ldp [[NEW_LO:x[0-9]+]], [[NEW_HI:x[0-9]+]], [x[[VAR128]]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: ; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0] ; CHECK: cmp [[OLD_LO]], [[DESIRED_LO]] diff --git a/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll b/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll @@ -0,0 +1,221 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone %s -o - | FileCheck %s + +define void @test_atomic_load(i128* %addr) { +; CHECK-LABEL: test_atomic_load: + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: stp [[LO]], [[HI]], [x0] + %res.0 = load atomic i128, i128* %addr monotonic, align 16 + store i128 %res.0, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: stp [[LO]], [[HI]], [x0] + %res.1 = load atomic i128, i128* %addr unordered, align 16 + store i128 %res.1, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: stp [[LO]], [[HI]], [x0] + %res.2 = load atomic i128, i128* %addr acquire, align 16 + store i128 %res.2, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: dmb ish +; CHECK: stp [[LO]], [[HI]], [x0] + %res.3 = load atomic i128, i128* %addr seq_cst, align 16 + store i128 %res.3, i128* %addr + + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8] +; CHECK-DAG: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.5, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16 + store i128 %res.6, i128* %addr + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 to i128* + %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16 + store i128 %res.7, i128* %addr + + ret void +} + +define void @test_libcall_load(i128* %addr) { +; CHECK-LABEL: test_libcall_load: +; CHECK: bl __atomic_load + %res.8 = load atomic i128, i128* %addr unordered, align 8 + store i128 %res.8, i128* %addr + + ret void +} + +define void @test_nonfolded_load1(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load1: + %addr8 = bitcast i128* %addr 
to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load2(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_nonfolded_load3(i128* %addr) { +; CHECK-LABEL: test_nonfolded_load3: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]] +; CHECK: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16 + store i128 %res.1, i128* %addr + + ret void +} + +define void @test_atomic_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_atomic_store: + +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr monotonic, align 16 + +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr unordered, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] + store atomic i128 %val, i128* %addr release, align 16 + +; CHECK: dmb ish +; CHECK: stp x2, x3, [x0] +; CHECK: dmb ish + store atomic i128 %val, i128* %addr seq_cst, align 16 + + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: stp x2, x3, [x0, #8] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #504] + %addr8.2 = getelementptr i8, i8* %addr8, i32 504 + %addr128.2 = bitcast i8* %addr8.2 to i128* + store atomic i128 %val, i128* %addr128.2 monotonic, align 16 + +; CHECK: stp x2, x3, [x0, #-512] + %addr8.3 = getelementptr i8, i8* %addr8, i32 -512 + %addr128.3 = bitcast i8* %addr8.3 to i128* + store atomic i128 %val, i128* %addr128.3 monotonic, align 16 + + ret void +} + +define void @test_libcall_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_libcall_store: +; CHECK: bl __atomic_store + store atomic i128 %val, i128* %addr unordered, align 8 + + ret void +} + +define void @test_nonfolded_store1(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store1: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #4 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 4 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store2(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store2: + %addr8 = bitcast i128* %addr to i8* + +; CHECK: add x[[ADDR:[0-9]+]], x0, #512 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 512 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define void @test_nonfolded_store3(i128* %addr, i128 %val) { +; CHECK-LABEL: test_nonfolded_store3: + %addr8 = bitcast i128* %addr to i8* + 
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520 +; CHECK: stp x2, x3, [x[[ADDR]]] + %addr8.1 = getelementptr i8, i8* %addr8, i32 -520 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store atomic i128 %val, i128* %addr128.1 monotonic, align 16 + + ret void +} + +define i128 @test_volatile_atomic_load(i128* %addr) { +; CHECK-LABEL: test_volatile_atomic_load: + +; CHECK: ldp [[LHS_LO:x[0-9]+]], [[LHS_HI:x[0-9]+]], [x0] + %lhs = load atomic volatile i128, i128* %addr monotonic, align 16 + +; CHECK: bl __atomic_load +; CHECK: ldp [[RHS_LO:x[0-9]+]], [[RHS_HI:x[0-9]+]], [sp] + %rhs = load atomic volatile i128, i128* %addr monotonic, align 1 + +; CHECK: adds x0, [[LHS_LO]], [[RHS_LO]] +; CHECK: adcs x1, [[LHS_HI]], [[RHS_HI]] + %res = add i128 %lhs, %rhs + ret i128 %res +} + +define void @test_volatile_atomic_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_volatile_atomic_store: + +; CHECK: stp x2, x3, [x0] + store atomic volatile i128 %val, i128* %addr monotonic, align 16 + +; CHECK: bl __atomic_store + store atomic volatile i128 %val, i128* %addr monotonic, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/volatile-128.ll b/llvm/test/CodeGen/AArch64/volatile-128.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/volatile-128.ll @@ -0,0 +1,68 @@ +; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align %s -o - | FileCheck %s --check-prefix=CHECK-STRICT-ALIGN + +define void @test_volatile_load(i128* %addr) { +; CHECK-LABEL: test_volatile_load: + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: stp [[LO]], [[HI]], [x0] + %res.0 = load volatile i128, i128* %addr, align 16 + store volatile i128 %res.0, i128* %addr + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8] +; CHECK-DAG: stp [[LO]], [[HI]], [x0] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + %res.5 = load volatile i128, i128* %addr128.1, align 16 + store volatile i128 %res.5, i128* %addr + +; CHECK: ldp +; CHECK-STRICT-ALIGN-LABEL: test_volatile_load: +; CHECK-STRICT-ALIGN: ldrb + load volatile i128, i128* %addr, align 1 + + ret void +} + +define void @test_volatile_store(i128* %addr, i128 %val) { +; CHECK-LABEL: test_volatile_store: + +; CHECK: stp x2, x3, [x0] + store volatile i128 %val, i128* %addr, align 16 + + %addr8 = bitcast i128* %addr to i8* + +; CHECK: stp x2, x3, [x0, #8] + %addr8.1 = getelementptr i8, i8* %addr8, i32 8 + %addr128.1 = bitcast i8* %addr8.1 to i128* + store volatile i128 %val, i128* %addr128.1, align 16 + +; CHECK: stp +; CHECK-STRICT-ALIGN-LABEL: test_volatile_store: +; CHECK-STRICT-ALIGN: strb + store volatile i128 %val, i128* %addr, align 1 + + ret void +} + +define i256 @test_ext(i128* %addr) { +; CHECK-LABEL: test_ext: +; CHECK: ldp x0, x1, [x0] +; CHECK: mov x2, xzr +; CHECK: mov x3, xzr + + %res.128 = load volatile i128, i128* %addr + %res = zext i128 %res.128 to i256 + ret i256 %res +} + +define void @test_trunc(i128* %addr, i256 %val) { +; CHECK-LABEL: test_trunc: +; CHECK: stp x2, x3, [x0] + + %val.128 = trunc i256 %val to i128 + store i128 %val.128, i128* %addr + ret void +}