diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -272,7 +272,8 @@
   STZ2G,
 
   LDP,
-  STP
+  STP,
+  STNP
 };
 
 } // end namespace AArch64ISD
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -525,6 +525,17 @@
     setOperationAction(ISD::LOAD, MVT::i128, Custom);
     setOperationAction(ISD::STORE, MVT::i128, Custom);
 
+    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+    // the custom lowering, as there are no un-paired non-temporal stores and
+    // legalization will break up 256 bit inputs.
+    setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+    setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+    setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+    setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+    setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+    setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+    setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
     // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
     // This requires the Performance Monitors extension.
     if (Subtarget->hasPerfMon())
@@ -1382,6 +1393,7 @@
   case AArch64ISD::SST1_IMM:          return "AArch64ISD::SST1_IMM";
   case AArch64ISD::LDP:               return "AArch64ISD::LDP";
   case AArch64ISD::STP:               return "AArch64ISD::STP";
+  case AArch64ISD::STNP:              return "AArch64ISD::STNP";
   }
   return nullptr;
 }
@@ -3070,6 +3082,30 @@
     if (StoreNode->isTruncatingStore()) {
       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
     }
+    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+    // the custom lowering, as there are no un-paired non-temporal stores and
+    // legalization will break up 256 bit inputs.
+    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+        MemVT.getVectorElementCount().Min % 2u == 0 &&
+        ((MemVT.getScalarSizeInBits() == 8u ||
+          MemVT.getScalarSizeInBits() == 16u ||
+          MemVT.getScalarSizeInBits() == 32u ||
+          MemVT.getScalarSizeInBits() == 64u))) {
+      SDValue Lo =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+      SDValue Hi = DAG.getNode(
+          ISD::EXTRACT_SUBVECTOR, Dl,
+          MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+          StoreNode->getValue(),
+          DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+      SDValue Result = DAG.getMemIntrinsicNode(
+          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+      return Result;
+    }
   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
     assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
     SDValue Lo =
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -245,6 +245,7 @@
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 
 // Generates the general dynamic sequences, i.e.
 //  adrp x0, :tlsdesc:var
@@ -544,6 +545,7 @@
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -2734,6 +2736,10 @@
 def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
           (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
+          (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
+
+
 //---
 // (Register offset)
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -2,10 +2,7 @@
 define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
 ; CHECK-LABEL: test_stnp_v4i64:
-; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
-; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
-; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16]
-; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT: stnp q0, q1, [x0]
 ; CHECK-NEXT: ret
   store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
   ret void
 }
@@ -334,6 +331,149 @@
   ret void
 }
 
+define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i8:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i16:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32f16:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v17f32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: ldr s16, [sp, #16]
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: mov.s v4[1], v5[0]
+; CHECK-NEXT: ldr s1, [sp]
+; CHECK-NEXT: add x8, sp, #20
+; CHECK-NEXT: ld1.s { v16 }[1], [x8]
+; CHECK-NEXT: add x8, sp, #4
+; CHECK-NEXT: ld1.s { v1 }[1], [x8]
+; CHECK-NEXT: add x8, sp, #24
+; CHECK-NEXT: ld1.s { v16 }[2], [x8]
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: ld1.s { v1 }[2], [x8]
+; CHECK-NEXT: add x8, sp, #28
+; CHECK-NEXT: ld1.s { v16 }[3], [x8]
+; CHECK-NEXT: add x8, sp, #12
+; CHECK-NEXT: mov.s v0[2], v2[0]
+; CHECK-NEXT: ldr s2, [sp, #32]
+; CHECK-NEXT: mov.s v4[2], v6[0]
+; CHECK-NEXT: mov.s v0[3], v3[0]
+; CHECK-NEXT: mov.s v4[3], v7[0]
+; CHECK-NEXT: mov d3, v4[1]
+; CHECK-NEXT: mov d5, v0[1]
+; CHECK-NEXT: ld1.s { v1 }[3], [x8]
+; CHECK-NEXT: stnp d4, d3, [x0, #16]
+; CHECK-NEXT: stnp d0, d5, [x0]
+; CHECK-NEXT: mov d0, v16[1]
+; CHECK-NEXT: mov d3, v1[1]
+; CHECK-NEXT: stnp d16, d0, [x0, #48]
+; CHECK-NEXT: stnp d1, d3, [x0, #32]
+; CHECK-NEXT: str s2, [x0, #64]
+; CHECK-NEXT: ret
+
+entry:
+  store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: mov w8, #32000
+; CHECK-NEXT: mov w9, #32032
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: add x9, x0, x9
+; CHECK-NEXT: stnp q2, q3, [x9]
+; CHECK-NEXT: stnp q0, q1, [x8]
+; CHECK-NEXT: ret
+
+entry:
+  %gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
+  store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f64:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q6, q7, [x0, #96]
+; CHECK-NEXT: stnp q4, q5, [x0, #64]
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i64:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q6, q7, [x0, #96]
+; CHECK-NEXT: stnp q4, q5, [x0, #64]
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+  store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
 !0 = !{ i32 1 }
 
 attributes #0 = { nounwind }
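
For reference, a usage sketch (not part of the patch): clang's `__builtin_nontemporal_store` is one way to produce the `!nontemporal` store metadata that this custom lowering keys on. Assuming a 256-bit vector type declared with `vector_size`, a reproducer like the following should now select a single `stnp q0, q1, [x0]` on AArch64 instead of the previous four d-register `stnp` sequence shown in the old `test_stnp_v4i64` CHECK lines:

```c++
#include <cstdint>

// Hypothetical reproducer mirroring the v4i64 test above: a 256-bit
// vector of four i64 lanes.
typedef int64_t v4i64 __attribute__((vector_size(32)));

void store_nt(v4i64 *p, v4i64 v) {
  // clang lowers this builtin to `store <4 x i64> ..., !nontemporal !0`,
  // which is the shape the new LowerSTORE code matches.
  __builtin_nontemporal_store(v, p);
}
```

Compiling this for an AArch64 target with optimizations enabled should exercise the new STNP path.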