diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -450,6 +450,7 @@
   STZ2G,
 
   LDP,
+  LDNP,
   STP,
   STNP,
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -790,6 +790,13 @@
     setOperationAction(ISD::STORE, MVT::v4f64, Custom);
     setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+    setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
+    setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
+    setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
+    setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
+    setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
 
     // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
     // This requires the Performance Monitors extension.
     if (Subtarget->hasPerfMon())
@@ -2311,6 +2318,7 @@
     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
     MAKE_CASE(AArch64ISD::LDP)
+    MAKE_CASE(AArch64ISD::LDNP)
     MAKE_CASE(AArch64ISD::STP)
     MAKE_CASE(AArch64ISD::STNP)
     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
@@ -20375,9 +20383,32 @@
     return;
   case ISD::ATOMIC_LOAD:
  case ISD::LOAD: {
+    MemSDNode *LoadNode = cast<MemSDNode>(N);
+    EVT MemVT = LoadNode->getMemoryVT();
+    // 256-bit non-temporal vector loads can be lowered to LDNP.
+    if (LoadNode->isNonTemporal() && MemVT.isVector() &&
+        MemVT.getSizeInBits() == 256u &&
+        MemVT.getVectorElementCount().isKnownEven() &&
+        (MemVT.getScalarSizeInBits() == 8u ||
+         MemVT.getScalarSizeInBits() == 16u ||
+         MemVT.getScalarSizeInBits() == 32u ||
+         MemVT.getScalarSizeInBits() == 64u)) {
+      SDValue Result = DAG.getMemIntrinsicNode(
+          AArch64ISD::LDNP, SDLoc(N),
+          DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                         MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                         MVT::Other}),
+          {LoadNode->getChain(), LoadNode->getBasePtr()},
+          LoadNode->getMemoryVT(), LoadNode->getMemOperand());
+
+      // Concatenate the two half-width results back to the requested type.
+      SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
+                                 Result.getValue(0), Result.getValue(1));
+      Results.append({Pair, Result.getValue(2) /* Chain */});
+      return;
+    }
    assert(SDValue(N, 0).getValueType() == MVT::i128 &&
            "unexpected load's value type");
-    MemSDNode *LoadNode = cast<MemSDNode>(N);
     if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
         LoadNode->getMemoryVT() != MVT::i128) {
       // Non-volatile or atomic loads are optimized later in AArch64's load/store
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -318,6 +318,7 @@
 def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 
@@ -728,6 +729,7 @@
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
 
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -2581,6 +2583,8 @@ def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (LDPXi GPR64sp:$Rn, simm7s8:$offset)>; +def : Pat<(AArch64ldnp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)), + (LDNPQi GPR64sp:$Rn, simm7s16:$offset)>; //--- // (register offset) //--- diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -4,7 +4,7 @@ define <4 x double> @test_ldnp_v4f64(<4 x double>* %A) { ; CHECK-LABEL: test_ldnp_v4f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <4 x double>, <4 x double>* %A, align 8, !nontemporal !0 ret <4 x double> %lv @@ -13,7 +13,7 @@ define <4 x i64> @test_ldnp_v4i64(<4 x i64>* %A) { ; CHECK-LABEL: test_ldnp_v4i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <4 x i64>, <4 x i64>* %A, align 8, !nontemporal !0 ret <4 x i64> %lv @@ -21,7 +21,7 @@ define <8 x i32> @test_ldnp_v8i32(<8 x i32>* %A) { ; CHECK-LABEL: test_ldnp_v8i32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <8 x i32>, <8 x i32>* %A, align 8, !nontemporal !0 ret <8 x i32> %lv @@ -30,7 +30,7 @@ define <8 x float> @test_ldnp_v8f32(<8 x float>* %A) { ; CHECK-LABEL: test_ldnp_v8f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <8 x float>, <8 x float>* %A, align 8, !nontemporal !0 ret <8 x float> %lv @@ -39,7 +39,7 @@ define <16 x i16> @test_ldnp_v16i16(<16 x i16>* %A) { ; CHECK-LABEL: test_ldnp_v16i16: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <16 x i16>, <16 x i16>* %A, align 8, !nontemporal !0 ret <16 x i16> %lv @@ -48,7 +48,7 @@ define <16 x half> @test_ldnp_v16f16(<16 x half>* %A) { ; CHECK-LABEL: test_ldnp_v16f16: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <16 x half>, <16 x half>* %A, align 8, !nontemporal !0 ret <16 x half> %lv @@ -57,7 +57,7 @@ define <32 x i8> @test_ldnp_v32i8(<32 x i8>* %A) { ; CHECK-LABEL: test_ldnp_v32i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ret %lv = load <32 x i8>, <32 x i8>* %A, align 8, !nontemporal !0 ret <32 x i8> %lv @@ -164,8 +164,8 @@ define <32 x i16> @test_ldnp_v32i16(<32 x i16>* %A) { ; CHECK-LABEL: test_ldnp_v32i16: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldnp q0, q1, [x0] +; CHECK-NEXT: ldnp q2, q3, [x0, #32] ; CHECK-NEXT: ret %lv = load <32 x i16>, <32 x i16>* %A, align 8, !nontemporal !0 ret <32 x i16> %lv @@ -174,8 +174,8 @@ define <32 x half> @test_ldnp_v32f16(<32 x half>* %A) { ; CHECK-LABEL: test_ldnp_v32f16: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldnp q0, q1, [x0] +; CHECK-NEXT: ldnp q2, q3, [x0, #32] ; 
 ; CHECK-NEXT:    ret
   %lv = load <32 x half>, <32 x half>* %A, align 8, !nontemporal !0
   ret <32 x half> %lv
@@ -184,8 +184,8 @@
 define <16 x i32> @test_ldnp_v16i32(<16 x i32>* %A) {
 ; CHECK-LABEL: test_ldnp_v16i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldnp q0, q1, [x0]
+; CHECK-NEXT:    ldnp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ret
   %lv = load <16 x i32>, <16 x i32>* %A, align 8, !nontemporal !0
   ret <16 x i32> %lv
@@ -194,8 +194,8 @@
 define <16 x float> @test_ldnp_v16f32(<16 x float>* %A) {
 ; CHECK-LABEL: test_ldnp_v16f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldnp q0, q1, [x0]
+; CHECK-NEXT:    ldnp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ret
   %lv = load <16 x float>, <16 x float>* %A, align 8, !nontemporal !0
   ret <16 x float> %lv
@@ -218,10 +218,10 @@
 define <16 x i64> @test_ldnp_v16i64(<16 x i64>* %A) {
 ; CHECK-LABEL: test_ldnp_v16i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    ldnp q0, q1, [x0]
+; CHECK-NEXT:    ldnp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldnp q4, q5, [x0, #64]
+; CHECK-NEXT:    ldnp q6, q7, [x0, #96]
 ; CHECK-NEXT:    ret
   %lv = load <16 x i64>, <16 x i64>* %A, align 8, !nontemporal !0
   ret <16 x i64> %lv
@@ -230,10 +230,10 @@
 define <16 x double> @test_ldnp_v16f64(<16 x double>* %A) {
 ; CHECK-LABEL: test_ldnp_v16f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    ldnp q0, q1, [x0]
+; CHECK-NEXT:    ldnp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldnp q4, q5, [x0, #64]
+; CHECK-NEXT:    ldnp q6, q7, [x0, #96]
 ; CHECK-NEXT:    ret
   %lv = load <16 x double>, <16 x double>* %A, align 8, !nontemporal !0
   ret <16 x double> %lv
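
Note: the !nontemporal loads exercised by these tests can be produced from C/C++ with Clang's __builtin_nontemporal_load. A minimal sketch (the v8i32 and load_nt names are illustrative, not part of this patch):

  // Clang emits `load <8 x i32>, ... !nontemporal` for this; with the
  // lowering above, AArch64 selects `ldnp q0, q1, [x0]` instead of
  // `ldp q0, q1, [x0]` (compare test_ldnp_v8i32).
  typedef int v8i32 __attribute__((vector_size(32)));
  v8i32 load_nt(v8i32 *p) { return __builtin_nontemporal_load(p); }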