Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10640,11 +10640,79 @@
   return std::make_pair(Lo, Hi);
 }
 
+// Create an even/odd pair of X registers holding integer value V.
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+  SDLoc dl(V.getNode());
+  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
+  SDValue VHi = DAG.getAnyExtOrTrunc(
+      DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
+      dl, MVT::i64);
+  if (DAG.getDataLayout().isBigEndian())
+    std::swap(VLo, VHi);
+  SDValue RegClass =
+      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
+  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
+  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
+
 static void ReplaceCMP_SWAP_128Results(SDNode *N,
-                                       SmallVectorImpl<SDValue> & Results,
-                                       SelectionDAG &DAG) {
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG,
+                                       const AArch64Subtarget *Subtarget) {
   assert(N->getValueType(0) == MVT::i128 &&
          "AtomicCmpSwap on types less than 128 should be legal");
+
+  if (Subtarget->hasLSE()) {
+    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
+    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
+    SDValue Ops[] = {
+        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
+        createGPRPairNode(DAG, N->getOperand(3)), // Store value
+        N->getOperand(1), // Ptr
+        N->getOperand(0), // Chain in
+    };
+
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+    MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+
+    unsigned Opcode;
+    switch (MemOp[0]->getOrdering()) {
+    case AtomicOrdering::Monotonic:
+      Opcode = AArch64::CASPX;
+      break;
+    case AtomicOrdering::Acquire:
+      Opcode = AArch64::CASPAX;
+      break;
+    case AtomicOrdering::Release:
+      Opcode = AArch64::CASPLX;
+      break;
+    case AtomicOrdering::AcquireRelease:
+    case AtomicOrdering::SequentiallyConsistent:
+      Opcode = AArch64::CASPALX;
+      break;
+    default:
+      llvm_unreachable("Unexpected ordering!");
+    }
+
+    MachineSDNode *CmpSwap = DAG.getMachineNode(
+        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
+    CmpSwap->setMemRefs(MemOp, MemOp + 1);
+
+    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
+    if (DAG.getDataLayout().isBigEndian())
+      std::swap(SubReg1, SubReg2);
+    Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+                                                 SDValue(CmpSwap, 0)));
+    Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+                                                 SDValue(CmpSwap, 0)));
+    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
+    return;
+  }
+
   auto Desired = splitInt128(N->getOperand(2), DAG);
   auto New = splitInt128(N->getOperand(3), DAG);
   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
@@ -10703,7 +10771,7 @@
     // Let normal code take care of it by not adding anything to Results.
     return;
   case ISD::ATOMIC_CMP_SWAP:
-    ReplaceCMP_SWAP_128Results(N, Results, DAG);
+    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
   }
 }
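Note on the lowering above: CASP operates on an even/odd pair of X registers (XSeqPairsClass), with the sube64 subregister holding bits [63:0] of the i128 value and subo64 holding bits [127:64]; big-endian targets swap the halves. A standalone C++ sketch of the split performed by createGPRPairNode (XPair and splitForCasp are illustrative names, not LLVM API; unsigned __int128 is a GCC/Clang extension):

  #include <cstdint>
  #include <utility>

  struct XPair { uint64_t Even, Odd; }; // models the sube64/subo64 halves

  static XPair splitForCasp(unsigned __int128 V, bool IsBigEndian) {
    uint64_t Lo = static_cast<uint64_t>(V);       // bits [63:0], like VLo
    uint64_t Hi = static_cast<uint64_t>(V >> 64); // bits [127:64], like VHi
    if (IsBigEndian)
      std::swap(Lo, Hi); // mirrors the isBigEndian() swap in the patch
    return {Lo, Hi};     // Even -> sube64, Odd -> subo64
  }
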
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2592,6 +2592,16 @@
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Twov1d;
       Offset = false;
+    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, MBBI, DL, get(AArch64::STPXi))
+          .addReg(TRI->getSubReg(SrcReg, AArch64::sube64),
+                  getKillRegState(isKill))
+          .addReg(TRI->getSubReg(SrcReg, AArch64::subo64),
+                  getKillRegState(isKill))
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO);
+      return;
     }
     break;
   case 24:
@@ -2690,6 +2700,16 @@
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Twov1d;
       Offset = false;
+    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, MBBI, DL, get(AArch64::LDPXi))
+          .addReg(TRI->getSubReg(DestReg, AArch64::sube64),
+                  getDefRegState(true))
+          .addReg(TRI->getSubReg(DestReg, AArch64::subo64),
+                  getDefRegState(true))
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO);
+      return;
     }
     break;
   case 24:
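Note on the spill code above: XSeqPairsClass covers two 64-bit X registers, so its values take the 16-byte case of storeRegToStackSlot/loadRegFromStackSlot, and STPXi/LDPXi move both halves with a single store/load pair. A minimal sketch of the frame-slot layout this produces (SpillSlot16, storePair, and loadPair are illustrative, not LLVM API):

  #include <cstdint>

  struct SpillSlot16 { uint64_t Half[2]; }; // one 16-byte frame index

  // stp puts sube64 at the lower address and subo64 at the higher one;
  // ldp restores them symmetrically.
  static void storePair(SpillSlot16 &FI, uint64_t Even, uint64_t Odd) {
    FI.Half[0] = Even; // sube64
    FI.Half[1] = Odd;  // subo64
  }
  static void loadPair(const SpillSlot16 &FI, uint64_t &Even, uint64_t &Odd) {
    Even = FI.Half[0];
    Odd = FI.Half[1];
  }
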
Index: test/CodeGen/AArch64/atomic-ops-lse.ll
===================================================================
--- test/CodeGen/AArch64/atomic-ops-lse.ll
+++ test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s
 
@@ -11,6 +12,7 @@
 @var16 = global i16 0
 @var32 = global i32 0
 @var64 = global i64 0
+@var128 = global i128 0
 
 define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_add_i8:
@@ -682,6 +684,21 @@
   ret i64 %old
 }
 
+define i128 @test_atomic_cmpxchg_i128(i128 %wanted, i128 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i128:
+  %pair = cmpxchg i128* @var128, i128 %wanted, i128 %new acquire acquire
+  %old = extractvalue { i128, i1 } %pair, 0
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var128
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var128
+
+; CHECK: caspa x0, x1, x2, x3, [x[[ADDR]]]
+; CHECK-NOT: dmb
+
+  ret i128 %old
+}
+
 define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_sub_i8:
   %old = atomicrmw sub i8* @var8, i8 %offset seq_cst
@@ -1674,6 +1691,21 @@
   ret i64 %old
 }
 
+define i128 @test_atomic_cmpxchg_i128_acquire(i128 %wanted, i128 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i128_acquire:
+  %pair = cmpxchg i128* @var128, i128 %wanted, i128 %new acquire acquire
+  %old = extractvalue { i128, i1 } %pair, 0
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var128
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var128
+
+; CHECK: caspa x0, x1, x2, x3, [x[[ADDR]]]
+; CHECK-NOT: dmb
+
+  ret i128 %old
+}
+
 define i8 @test_atomic_cmpxchg_i8_monotonic(i8 %wanted, i8 %new) nounwind {
 ; CHECK-LABEL: test_atomic_cmpxchg_i8_monotonic:
   %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new monotonic monotonic
@@ -1734,6 +1766,21 @@
   ret i64 %old
 }
 
+define i128 @test_atomic_cmpxchg_i128_monotonic(i128 %wanted, i128 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i128_monotonic:
+  %pair = cmpxchg i128* @var128, i128 %wanted, i128 %new monotonic monotonic
+  %old = extractvalue { i128, i1 } %pair, 0
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var128
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var128
+
+; CHECK: casp x0, x1, x2, x3, [x[[ADDR]]]
+; CHECK-NOT: dmb
+
+  ret i128 %old
+}
+
 define i8 @test_atomic_cmpxchg_i8_seq_cst(i8 %wanted, i8 %new) nounwind {
 ; CHECK-LABEL: test_atomic_cmpxchg_i8_seq_cst:
   %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new seq_cst seq_cst
@@ -1794,6 +1841,21 @@
   ret i64 %old
 }
 
+define i128 @test_atomic_cmpxchg_i128_seq_cst(i128 %wanted, i128 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i128_seq_cst:
+  %pair = cmpxchg i128* @var128, i128 %wanted, i128 %new seq_cst seq_cst
+  %old = extractvalue { i128, i1 } %pair, 0
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var128
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var128
+
+; CHECK: caspal x0, x1, x2, x3, [x[[ADDR]]]
+; CHECK-NOT: dmb
+
+  ret i128 %old
+}
+
 define i8 @test_atomic_load_max_i8_acq_rel(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_max_i8_acq_rel:
   %old = atomicrmw max i8* @var8, i8 %offset acq_rel
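For reference, source-level code that should exercise this lowering when built for AArch64 with LSE (for example with clang -march=armv8.1-a); __int128 and __atomic_compare_exchange_n are GCC/Clang extensions, and the single-caspa expectation is inferred from the tests above rather than guaranteed:

  unsigned __int128 Var128;

  bool cas128_acquire(unsigned __int128 &Expected, unsigned __int128 Desired) {
    // With +lse this should select a caspa on an even/odd register pair
    // (x0/x1, x2/x3) instead of an ldaxp/stxp loop.
    return __atomic_compare_exchange_n(&Var128, &Expected, Desired,
                                       /*weak=*/false, __ATOMIC_ACQUIRE,
                                       __ATOMIC_ACQUIRE);
  }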