diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -453,6 +453,7 @@

   LDP,
   LDNP,
+  LDNP128,
   STP,
   STNP,

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -798,6 +798,11 @@
   setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
+  // 128-bit non-temporal loads are also custom lowered, to LDNP.
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i64, Custom);
+  setOperationAction(ISD::LOAD, MVT::v8i16, Custom);
+  setOperationAction(ISD::LOAD, MVT::v16i8, Custom);

   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
@@ -2325,6 +2330,7 @@
     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
     MAKE_CASE(AArch64ISD::LDP)
     MAKE_CASE(AArch64ISD::LDNP)
+    MAKE_CASE(AArch64ISD::LDNP128)
     MAKE_CASE(AArch64ISD::STP)
     MAKE_CASE(AArch64ISD::STNP)
     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
@@ -5353,7 +5359,28 @@
   SDLoc DL(Op);
   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   assert(LoadNode && "Expected custom lowering of a load node");
-
+  // Handle lowering 128-bit non-temporal loads for little-endian targets.
+  EVT MemVT = LoadNode->getMemoryVT();
+  if (LoadNode->isNonTemporal() &&
+      Subtarget->isLittleEndian() &&
+      MemVT.getSizeInBits() == 128 &&
+      (MemVT.getScalarSizeInBits() == 8u ||
+       MemVT.getScalarSizeInBits() == 16u ||
+       MemVT.getScalarSizeInBits() == 32u ||
+       MemVT.getScalarSizeInBits() == 64u)) {
+
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::LDNP128, DL,
+        DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                       MVT::Other}),
+        {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
+        LoadNode->getMemOperand());
+
+    SDValue P = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MemVT,
+                            Result.getValue(0), Result.getValue(1));
+    return DAG.getMergeValues({P, Result.getValue(2) /* Chain */}, DL);
+  }
   if (LoadNode->getMemoryVT() == MVT::i64x8) {
     SmallVector<SDValue, 8> Ops;
     SDValue Base = LoadNode->getBasePtr();
@@ -5374,9 +5401,8 @@

   // Custom lowering for extending v4i8 vector loads.
   EVT VT = Op->getValueType(0);
-  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
-
-  if (LoadNode->getMemoryVT() != MVT::v4i8)
+  if ((VT != MVT::v4i16 && VT != MVT::v4i32) ||
+      LoadNode->getMemoryVT() != MVT::v4i8)
     return SDValue();

   unsigned ExtType;
@@ -20607,7 +20633,8 @@
   MemSDNode *LoadNode = cast<MemSDNode>(N);
   EVT MemVT = LoadNode->getMemoryVT();
   // Handle lowering 256 bit non temporal loads into LDNP.
-  if (LoadNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+  if (LoadNode->isNonTemporal() &&
+      MemVT.getSizeInBits() == 256u &&
       (MemVT.getScalarSizeInBits() == 8u ||
        MemVT.getScalarSizeInBits() == 16u ||
        MemVT.getScalarSizeInBits() == 32u ||
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -319,6 +319,7 @@
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldnp128 : SDTypeProfile<2, 1, [SDTCisVT<0, v2i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
@@ -732,6 +733,7 @@
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64ldnp128 : SDNode<"AArch64ISD::LDNP128", SDT_AArch64ldnp128, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -2592,6 +2594,9 @@
 def : Pat<(AArch64ldnp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
           (LDNPQi GPR64sp:$Rn, simm7s16:$offset)>;
+
+def : Pat<(AArch64ldnp128 (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (LDNPDi GPR64sp:$Rn, simm7s8:$offset)>;

 //---
 // (register offset)
 //---
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -127,7 +127,8 @@
 define <4 x i32> @test_ldnp_v4i32(<4 x i32>* %A) {
 ; CHECK-LABEL: test_ldnp_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i32:
@@ -141,7 +142,8 @@
 define <4 x float> @test_ldnp_v4f32(<4 x float>* %A) {
 ; CHECK-LABEL: test_ldnp_v4f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4f32:
@@ -155,7 +157,8 @@
 define <8 x i16> @test_ldnp_v8i16(<8 x i16>* %A) {
 ; CHECK-LABEL: test_ldnp_v8i16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v8i16:
@@ -169,7 +172,8 @@
 define <16 x i8> @test_ldnp_v16i8(<16 x i8>* %A) {
 ; CHECK-LABEL: test_ldnp_v16i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v16i8:
@@ -182,7 +186,8 @@
 define <2 x double> @test_ldnp_v2f64(<2 x double>* %A) {
 ; CHECK-LABEL: test_ldnp_v2f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v2f64:
@@ -376,12 +381,18 @@
 define <17 x float> @test_ldnp_v17f32(<17 x float>* %A) {
 ; CHECK-LABEL: test_ldnp_v17f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0, #32]
-; CHECK-NEXT:    ldp q3, q4, [x0]
-; CHECK-NEXT:    ldr s0, [x0, #64]
-; CHECK-NEXT:    stp q3, q4, [x8]
-; CHECK-NEXT:    stp q1, q2, [x8, #32]
-; CHECK-NEXT:    str s0, [x8, #64]
+; CHECK-NEXT:    ldnp d1, d0, [x0, #16]
+; CHECK-NEXT:    ldnp d3, d2, [x0, #48]
+; CHECK-NEXT:    ldnp d6, d5, [x0, #32]
+; CHECK-NEXT:    mov.d v1[1], v0[0]
+; CHECK-NEXT:    ldnp d16, d7, [x0]
+; CHECK-NEXT:    mov.d v3[1], v2[0]
+; CHECK-NEXT:    ldr s4, [x0, #64]
+; CHECK-NEXT:    mov.d v6[1], v5[0]
+; CHECK-NEXT:    mov.d v16[1], v7[0]
+; CHECK-NEXT:    str s4, [x8, #64]
+; CHECK-NEXT:    stp q6, q3, [x8, #32]
+; CHECK-NEXT:    stp q16, q1, [x8]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v17f32:
@@ -410,24 +421,61 @@
 define <33 x double> @test_ldnp_v33f64(<33 x double>* %A) {
 ; CHECK-LABEL: test_ldnp_v33f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    ldp q6, q7, [x0, #96]
-; CHECK-NEXT:    ldp q16, q17, [x0, #128]
-; CHECK-NEXT:    ldp q18, q19, [x0, #160]
-; CHECK-NEXT:    ldp q21, q22, [x0, #224]
-; CHECK-NEXT:    ldp q23, q24, [x0, #192]
-; CHECK-NEXT:    ldr d20, [x0, #256]
-; CHECK-NEXT:    stp q0, q1, [x8]
-; CHECK-NEXT:    stp q2, q3, [x8, #32]
-; CHECK-NEXT:    stp q4, q5, [x8, #64]
-; CHECK-NEXT:    str d20, [x8, #256]
-; CHECK-NEXT:    stp q6, q7, [x8, #96]
-; CHECK-NEXT:    stp q16, q17, [x8, #128]
-; CHECK-NEXT:    stp q18, q19, [x8, #160]
-; CHECK-NEXT:    stp q23, q24, [x8, #192]
-; CHECK-NEXT:    stp q21, q22, [x8, #224]
+; CHECK-NEXT:    stp d13, d12, [sp, #-48]! ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    stp d11, d10, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    ldnp d31, d30, [x0, #240]
+; CHECK-NEXT:    ldnp d9, d8, [x0, #224]
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    mov.d v31[1], v30[0]
+; CHECK-NEXT:    ldnp d2, d3, [x0, #16]
+; CHECK-NEXT:    mov.d v9[1], v8[0]
+; CHECK-NEXT:    ldnp d4, d5, [x0, #32]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    ldnp d6, d7, [x0, #48]
+; CHECK-NEXT:    mov.d v2[1], v3[0]
+; CHECK-NEXT:    ldnp d16, d17, [x0, #64]
+; CHECK-NEXT:    mov.d v4[1], v5[0]
+; CHECK-NEXT:    ldnp d18, d19, [x0, #80]
+; CHECK-NEXT:    mov.d v6[1], v7[0]
+; CHECK-NEXT:    ldnp d21, d20, [x0, #96]
+; CHECK-NEXT:    mov.d v16[1], v17[0]
+; CHECK-NEXT:    ldnp d23, d22, [x0, #112]
+; CHECK-NEXT:    mov.d v18[1], v19[0]
+; CHECK-NEXT:    ldnp d25, d24, [x0, #144]
+; CHECK-NEXT:    mov.d v21[1], v20[0]
+; CHECK-NEXT:    ldnp d27, d26, [x0, #176]
+; CHECK-NEXT:    mov.d v23[1], v22[0]
+; CHECK-NEXT:    ldnp d29, d28, [x0, #208]
+; CHECK-NEXT:    mov.d v25[1], v24[0]
+; CHECK-NEXT:    ldnp d11, d10, [x0, #192]
+; CHECK-NEXT:    mov.d v27[1], v26[0]
+; CHECK-NEXT:    ldnp d12, d30, [x0, #160]
+; CHECK-NEXT:    mov.d v29[1], v28[0]
+; CHECK-NEXT:    ldnp d13, d8, [x0, #128]
+; CHECK-NEXT:    mov.d v11[1], v10[0]
+; CHECK-NEXT:    ldr d28, [x0, #256]
+; CHECK-NEXT:    stp q0, q2, [x8]
+; CHECK-NEXT:    mov.d v12[1], v30[0]
+; CHECK-NEXT:    stp q4, q6, [x8, #32]
+; CHECK-NEXT:    stp q16, q18, [x8, #64]
+; CHECK-NEXT:    mov.d v13[1], v8[0]
+; CHECK-NEXT:    str d28, [x8, #256]
+; CHECK-NEXT:    stp q21, q23, [x8, #96]
+; CHECK-NEXT:    stp q12, q27, [x8, #160]
+; CHECK-NEXT:    stp q11, q29, [x8, #192]
+; CHECK-NEXT:    stp q13, q25, [x8, #128]
+; CHECK-NEXT:    stp q9, q31, [x8, #224]
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp], #48 ; 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v33f64:
@@ -504,10 +552,13 @@
 define <33 x i8> @test_ldnp_v33i8(<33 x i8>* %A) {
 ; CHECK-LABEL: test_ldnp_v33i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldnp d1, d0, [x0, #16]
+; CHECK-NEXT:    ldnp d3, d2, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #32]
-; CHECK-NEXT:    stp q1, q0, [x8]
+; CHECK-NEXT:    mov.d v1[1], v0[0]
+; CHECK-NEXT:    mov.d v3[1], v2[0]
 ; CHECK-NEXT:    strb w9, [x8, #32]
+; CHECK-NEXT:    stp q3, q1, [x8]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v33i8:
@@ -612,14 +663,9 @@
 define <5 x double> @test_ldnp_v5f64(<5 x double>* %A) {
 ; CHECK-LABEL: test_ldnp_v5f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldp q0, q2, [x0]
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ext.16b v3, v2, v2, #8
+; CHECK-NEXT:    ldnp d0, d1, [x0]
+; CHECK-NEXT:    ldnp d2, d3, [x0, #16]
 ; CHECK-NEXT:    ldr d4, [x0, #32]
-; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    ; kill: def $d3 killed $d3 killed $q3
 ; CHECK-NEXT:    ; kill: def $d4 killed $d4 killed $q4
 ; CHECK-NEXT:    ret
 ;