Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -660,6 +660,9 @@ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; + bool isOpSuitableForLDPSTP(const Instruction *I) const; + bool shouldInsertFencesForAtomic(const Instruction *I) const override; + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -910,6 +910,14 @@ if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); + // Aligned 128-bit loads and stores are single-copy atomic according to the + // v8.4a spec. Unfortunately i128 is not a legal type so the only opportunity + // we have to do anything with them is the very first DAG combine. 
+ if (Subtarget->hasV8_4aOps()) { + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + } + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); @@ -16291,6 +16299,64 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } +static void matchLDPSTPAddrMode(SDValue Addr, SDValue &Base, SDValue &Offset, + SelectionDAG &DAG) { + SDLoc DL(Addr); + Base = Addr; + Offset = DAG.getTargetConstant(0, DL, MVT::i32); + + if (Addr.getOpcode() != ISD::ADD || !isa<ConstantSDNode>(Addr->getOperand(1))) + return; + + int64_t Val = cast<ConstantSDNode>(Base->getOperand(1))->getSExtValue(); + if (Val % 8 != 0 || !isInt<7>(Val / 8)) + return; + + Base = Base->getOperand(0); + Offset = DAG.getTargetConstant(Val / 8, DL, MVT::i32); +} + +static SDValue performAtomic128Load(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + MemSDNode *MN = cast<MemSDNode>(N); + assert(MN->getAlignment() >= 16 && "ldp only works for aligned addresses"); + + SDLoc DL(N); + SDValue Base, Offset; + matchLDPSTPAddrMode(MN->getBasePtr(), Base, Offset, DAG); + + SDNode *NewLoad = + DAG.getMachineNode(AArch64::LDPXi, DL, MVT::i64, MVT::i64, MVT::Other, + Base, Offset, N->getOperand(0)); + DAG.setNodeMemRefs(cast<MachineSDNode>(NewLoad), MN->getMemOperand()); + + SDValue NewVal = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, + SDValue(NewLoad, 0), SDValue(NewLoad, 1)); + + DCI.CombineTo(N, {NewVal, SDValue(NewLoad, 2)}); + return SDValue(N, 0); +} + +static SDValue performAtomic128Store(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + MemSDNode *MN = cast<MemSDNode>(N); + assert(MN->getAlignment() >= 16 && "ldp only works for aligned addresses"); + + SDLoc DL(N); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, N->getOperand(2), + DAG.getIntPtrConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, N->getOperand(2), + DAG.getIntPtrConstant(1, DL)); + + SDValue Base, Offset; + matchLDPSTPAddrMode(MN->getBasePtr(), Base, Offset, DAG); + + SDValue Ops[] = {Lo, Hi, 
Base, Offset, N->getOperand(0)}; + SDNode *NewStore = DAG.getMachineNode(AArch64::STPXi, DL, MVT::Other, Ops); + DAG.setNodeMemRefs(cast<MachineSDNode>(NewStore), MN->getMemOperand()); + + return SDValue(NewStore, 0); +} // Turns the vector of indices into a vector of byte offstes by scaling Offset // by (BitWidth / 8). static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, @@ -16857,6 +16923,14 @@ return performVSelectCombine(N, DCI.DAG); case ISD::SETCC: return performSETCCCombine(N, DAG); + case ISD::ATOMIC_LOAD: + if (N->getValueType(0) == MVT::i128) + return performAtomic128Load(N, DAG, DCI); + break; + case ISD::ATOMIC_STORE: + if (N->getOperand(2)->getValueType(0) == MVT::i128) + return performAtomic128Store(N, DAG, DCI); + break; case ISD::LOAD: if (performTBISimplification(N->getOperand(1), DCI, DAG)) return SDValue(N, 0); @@ -17392,6 +17466,7 @@ unsigned Opcode; switch (MemOp->getMergedOrdering()) { case AtomicOrdering::Monotonic: + case AtomicOrdering::Unordered: Opcode = AArch64::CASPX; break; case AtomicOrdering::Acquire: @@ -17602,12 +17677,36 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } +// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic +// provided the address is 16-byte aligned. +bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { + if (!Subtarget->hasV8_4aOps()) + return false; + + if (auto LI = dyn_cast<LoadInst>(I)) + return LI->getType()->getPrimitiveSizeInBits() == 128 && + LI->getAlignment() >= 16; + else if (auto SI = dyn_cast<StoreInst>(I)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && + SI->getAlignment() >= 16; + + return false; +} + +bool AArch64TargetLowering::shouldInsertFencesForAtomic( + const Instruction *I) const { + return isOpSuitableForLDPSTP(I); +} + // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. 
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 128; + if (Size != 128) + return false; + + return !isOpSuitableForLDPSTP(SI); } // Loads and stores less than 128-bits are already atomic; ones above that @@ -17616,7 +17715,11 @@ TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + + if (Size != 128 || isOpSuitableForLDPSTP(LI)) + return AtomicExpansionKind::None; + + return AtomicExpansionKind::LLSC; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, Index: llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -824,6 +824,8 @@ return isStore ? AArch64::STRSui : AArch64::LDRSui; case 64: return isStore ? AArch64::STRDui : AArch64::LDRDui; + case 128: + return isStore ? 
AArch64::STRQui : AArch64::LDRQui; } break; } Index: llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -278,6 +278,10 @@ }; getActionDefinitionsBuilder(G_LOAD) + .customIf([=](const LegalityQuery &Query) { + return Query.Types[0] == s128 && + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; + }) .legalForTypesWithMemDesc({{s8, p0, s8, 8}, {s16, p0, s16, 8}, {s32, p0, s32, 8}, @@ -316,6 +320,10 @@ .scalarizeIf(typeIs(0, v2s16), 0); getActionDefinitionsBuilder(G_STORE) + .customIf([=](const LegalityQuery &Query) { + return Query.Types[0] == s128 && + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; + }) .legalForTypesWithMemDesc({{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16 {s32, p0, s8, 8}, // truncstorei8 from s32 @@ -1001,6 +1009,33 @@ return true; } +static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset, + MachineRegisterInfo &MRI) { + Base = Root; + Offset = 0; + + MachineInstr *RootDef = MRI.getVRegDef(Root); + if (!RootDef || RootDef->getOpcode() != TargetOpcode::G_PTR_ADD) + return; + + MachineOperand &OffImm = RootDef->getOperand(2); + if (!OffImm.isReg()) + return; + MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); + if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) + return; + int64_t RHSC; + MachineOperand &RHSOp1 = RHS->getOperand(1); + if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) + return; + + RHSC = RHSOp1.getCImm()->getSExtValue(); + if (isShiftedInt<7, 3>(RHSC)) { + Base = RootDef->getOperand(1).getReg(); + Offset = RHSC; + } +} + // FIXME: This should be removed and replaced with the generic bitcast legalize // action. 
bool AArch64LegalizerInfo::legalizeLoadStore( @@ -1020,6 +1055,36 @@ Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); + if (ValTy == LLT::scalar(128)) { + assert((*MI.memoperands_begin())->getSuccessOrdering() == + AtomicOrdering::Monotonic || + (*MI.memoperands_begin())->getSuccessOrdering() == + AtomicOrdering::Unordered); + assert(ST->hasV8_4aOps() && "ldp/stp not single copy atomic before v8.4a"); + LLT s64 = LLT::scalar(64); + MachineInstrBuilder NewI; + if (MI.getOpcode() == TargetOpcode::G_LOAD) { + NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {}); + MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)}); + } else { + auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0)); + NewI = MIRBuilder.buildInstr( + AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)}); + } + Register Base; + int Offset; + matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI); + NewI.addUse(Base); + NewI.addImm(Offset / 8); + + NewI.cloneMemRefs(MI); + constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(), + *MRI.getTargetRegisterInfo(), + *ST->getRegBankInfo()); + MI.eraseFromParent(); + return true; + } + if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || ValTy.getElementType().getAddressSpace() != 0) { LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1 -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=apple-a13 -global-isel -global-isel-abort=1 
| FileCheck %s --check-prefix=CHECK-CAS-O1 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0 -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 @var = global i128 0 define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { @@ -411,7 +411,7 @@ ; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload ; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11] -; CHECK-CAS-O0-NEXT: mov x8, #0 +; CHECK-CAS-O0-NEXT: mov x8, xzr ; CHECK-CAS-O0-NEXT: orr x9, x9, x8 ; CHECK-CAS-O0-NEXT: orr x10, x8, x10 ; CHECK-CAS-O0-NEXT: // implicit-def: $q0 Index: llvm/test/CodeGen/AArch64/atomic-ops-lse.ll =================================================================== --- llvm/test/CodeGen/AArch64/atomic-ops-lse.ll +++ llvm/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS ; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG -; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-v8.4a < %s | 
FileCheck %s ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created ; (i.e. reusing a register for status & data in store exclusive).