diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -260,8 +260,10 @@
   STG,
   STZG,
   ST2G,
-  STZ2G
+  STZ2G,
+  LDP,
+  STP
 };

 } // end namespace AArch64ISD

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -516,6 +516,10 @@
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

+  // 128-bit loads and stores can be done without expanding
+  setOperationAction(ISD::LOAD, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
@@ -1364,6 +1368,8 @@
   case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
   case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
   case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+  case AArch64ISD::LDP: return "AArch64ISD::LDP";
+  case AArch64ISD::STP: return "AArch64ISD::STP";
   }
   return nullptr;
 }
@@ -2988,7 +2994,7 @@

 // Custom lowering for any store, vector or scalar and/or default or with
 // a truncate operations. Currently only custom lower truncate operation
-// from vector v4i16 to v4i8.
+// from vector v4i16 to v4i8 or volatile stores of i128.
 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc Dl(Op);
@@ -3000,18 +3006,32 @@
   EVT VT = Value.getValueType();
   EVT MemVT = StoreNode->getMemoryVT();

-  assert (VT.isVector() && "Can only custom lower vector store types");
-
-  unsigned AS = StoreNode->getAddressSpace();
-  unsigned Align = StoreNode->getAlignment();
-  if (Align < MemVT.getStoreSize() &&
-      !allowsMisalignedMemoryAccesses(
-          MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
-    return scalarizeVectorStore(StoreNode, DAG);
-  }
-
-  if (StoreNode->isTruncatingStore()) {
-    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  if (VT.isVector()) {
+    unsigned AS = StoreNode->getAddressSpace();
+    unsigned Align = StoreNode->getAlignment();
+    if (Align < MemVT.getStoreSize() &&
+        !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+                                        StoreNode->getMemOperand()->getFlags(),
+                                        nullptr)) {
+      return scalarizeVectorStore(StoreNode, DAG);
+    }
+
+    if (StoreNode->isTruncatingStore()) {
+      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+    }
+  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
+    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
+    SDValue Lo =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(0, Dl, MVT::i64));
+    SDValue Hi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(1, Dl, MVT::i64));
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
+        {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+        StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+    return Result;
   }

   return SDValue();
@@ -12689,6 +12709,27 @@
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
+  case ISD::LOAD: {
+    assert(SDValue(N, 0).getValueType() == MVT::i128 &&
+           "unexpected load's value type");
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
+      // Non-volatile loads are optimized later in AArch64's load/store
+      // optimizer.
+      return;
+    }
+
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::LDP, SDLoc(N),
+        DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+        {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
+        LoadNode->getMemOperand());
+
+    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+                               Result.getValue(0), Result.getValue(1));
+    Results.append({Pair, Result.getValue(2) /* Chain */});
+    return;
+  }
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -243,6 +243,9 @@
 def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
                                                    SDTCisPtrTy<1>]>;

+def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+
 // Generates the general dynamic sequences, i.e.
 //  adrp  x0, :tlsdesc:var
 //  ldr   x1, [x0, #:tlsdesc_lo12:var]
@@ -535,6 +538,9 @@
 def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;

+def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//

 //===----------------------------------------------------------------------===//
@@ -1987,6 +1993,9 @@
 defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
 defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;

+def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (register offset)
 //---
@@ -2680,6 +2689,9 @@
 defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
 defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;

+def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (Register offset)
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
--- a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
@@ -87,10 +87,8 @@
 define {i128, i1} @test_cmpxchg_128_unsplit(i128* %addr) {
 ; CHECK-LABEL: test_cmpxchg_128_unsplit:
 ; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
-; CHECK: ldr [[DESIRED_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[DESIRED_LO:x[0-9]+]], [x[[VAR128]]]
-; CHECK: ldr [[NEW_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[NEW_LO:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[DESIRED_LO:x[0-9]+]], [[DESIRED_HI:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[NEW_LO:x[0-9]+]], [[NEW_HI:x[0-9]+]], [x[[VAR128]]]
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0]
 ; CHECK: cmp [[OLD_LO]], [[DESIRED_LO]]
diff --git a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+@x = common dso_local global i128 0
+@y = common dso_local global i128 0
+
+define void @test1() {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* @x
+  store volatile i128 %tmp, i128* @y
+  ret void
+}
+
+define void @test2() {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8, #504]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10, #504]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 504) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 504) to i128*)
+  ret void
+}
+
+define void @test3() {
+; CHECK-LABEL: test3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: add x8, x8, #512 // =512
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x10, x10, #512 // =512
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 512) to i128*)
+  ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8, #-512]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10, #-512]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -512) to i128*)
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: test5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: sub x8, x8, #520 // =520
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x10, x10, #520 // =520
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test6() {
+; CHECK-LABEL: test6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: sub x8, x8, #520 // =520
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x10, x10, #520 // =520
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test7() {
+; CHECK-LABEL: test7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: add x8, x8, #503 // =503
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x10, x10, #503 // =503
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 503) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 503) to i128*)
+  ret void
+}