Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1307,6 +1307,8 @@
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ld1rq : AdvSIMD_1Vec_PredLoad_Intrinsic;
+
 //
 // Stores
 //
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -224,6 +224,7 @@
   LDNF1S,
   LDFF1,
   LDFF1S,
+  LD1RQ,
 
   // Unsigned gather loads.
   GLD1,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1404,6 +1404,7 @@
   case AArch64ISD::INSR:              return "AArch64ISD::INSR";
   case AArch64ISD::PTEST:             return "AArch64ISD::PTEST";
   case AArch64ISD::PTRUE:             return "AArch64ISD::PTRUE";
+  case AArch64ISD::LD1RQ:             return "AArch64ISD::LD1RQ";
   case AArch64ISD::LDNF1:             return "AArch64ISD::LDNF1";
   case AArch64ISD::LDNF1S:            return "AArch64ISD::LDNF1S";
   case AArch64ISD::LDFF1:             return "AArch64ISD::LDFF1";
@@ -11588,6 +11589,25 @@
   return DAG.getMergeValues({ Load, LoadChain }, DL);
 }
 
+static SDValue performLD1RQCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  EVT LoadVT = VT;
+  if (VT.isFloatingPoint())
+    LoadVT = VT.changeTypeToInteger();
+
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(2), N->getOperand(3) };
+  SDValue L = DAG.getNode(AArch64ISD::LD1RQ, DL, { LoadVT, MVT::Other }, Ops);
+
+  if (VT.isFloatingPoint()) {
+    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  return L;
+}
+
 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
 /// load store optimizer pass will merge them to store pair stores. This should
 /// be better than a movi to create the vector zero followed by a vector store
@@ -13139,6 +13159,8 @@
   case Intrinsic::aarch64_sve_ld1:
   case Intrinsic::aarch64_sve_ldnt1:
     return performLD1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_ld1rq:
+    return performLD1RQCombine(N, DAG);
   case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
   case Intrinsic::aarch64_sve_ldnt1_gather:
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -483,6 +483,19 @@
   let ParserMatchClass = UImm6s16Operand;
 }
 
+def SImmS2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def SImmS3XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
+}]>;
+def SImmS4XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def SImmS16XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+}]>;
+
 // simm6sN predicate - True if the immediate is a multiple of N in the range
 // [-32 * N, 31 * N].
 def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
@@ -506,27 +519,27 @@
 }
 def simm4s2 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
   let PrintMethod = "printImmScale<2>";
   let ParserMatchClass = SImm4s2Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s3 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
   let PrintMethod = "printImmScale<3>";
   let ParserMatchClass = SImm4s3Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s4 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
   let PrintMethod = "printImmScale<4>";
   let ParserMatchClass = SImm4s4Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s16 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
   let PrintMethod = "printImmScale<16>";
   let ParserMatchClass = SImm4s16Operand;
   let DecoderMethod = "DecodeSImm<4>";
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -20,12 +20,19 @@
   SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
 ]>;
 
+def SDT_AArch64_LD1RQ : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
 def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ldff1 : SDNode<"AArch64ISD::LDFF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 
+def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1RQ, [SDNPHasChain, SDNPMayLoad]>;
+
 // Gather loads - node definitions
 //
 def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
@@ -1303,6 +1310,25 @@
   def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
             (PTEST_PP PPR:$pg, PPR:$src)>;
 
+  // LD1R of 128-bit masked data
+  def : Pat<(nxv16i8 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_B_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv8i16 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_H_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv4i32 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_W_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv2i64 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_D_IMM $gp, $base, (i64 0))>;
+
+  def : Pat<(nxv16i8 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv8i16 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv4i32 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv2i64 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
+
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8),
             (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,6 +1,141 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ;
+; LD1RQB
+;
+
+define <vscale x 16 x i8> @ld1rqb_i8(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i8 16
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+;
+; LD1RQH
+;
+
+define <vscale x 8 x i16> @ld1rqh_i16(<vscale x 8 x i1> %pred, i16* %addr) {
+; CHECK-LABEL: ld1rqh_i16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16(<vscale x 8 x i1> %pred, half* %addr) {
+; CHECK-LABEL: ld1rqh_f16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_imm(<vscale x 8 x i1> %pred, i16* %addr) {
+; CHECK-LABEL: ld1rqh_i16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-64]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i16 -32
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr) {
+; CHECK-LABEL: ld1rqh_f16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i16 -8
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; LD1RQW
+;
+
+define <vscale x 4 x i32> @ld1rqw_i32(<vscale x 4 x i1> %pred, i32* %addr) {
+; CHECK-LABEL: ld1rqw_i32:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32(<vscale x 4 x i1> %pred, float* %addr) {
+; CHECK-LABEL: ld1rqw_f32:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_imm(<vscale x 4 x i1> %pred, i32* %addr) {
+; CHECK-LABEL: ld1rqw_i32_imm:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0, #112]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i32 28
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, float* %addr) {
+; CHECK-LABEL: ld1rqw_f32_imm:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0, #32]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i32 8
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
+  ret <vscale x 4 x float> %res
+}
+
+;
+; LD1RQD
+;
+
+define <vscale x 2 x i64> @ld1rqd_i64(<vscale x 2 x i1> %pred, i64* %addr) {
+; CHECK-LABEL: ld1rqd_i64:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64(<vscale x 2 x i1> %pred, double* %addr) {
+; CHECK-LABEL: ld1rqd_f64:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_imm(<vscale x 2 x i1> %pred, i64* %addr) {
+; CHECK-LABEL: ld1rqd_i64_imm:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0, #64]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 8
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, double* %addr) {
+; CHECK-LABEL: ld1rqd_f64_imm:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0, #-128]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 -16
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
+  ret <vscale x 2 x double> %res
+}
+
+;
 ; LDNT1B
 ;
 
@@ -79,6 +214,14 @@
   ret <vscale x 2 x double> %res
 }
 
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
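
Usage sketch (not part of the patch): one way the new intrinsic is expected to be reached
from C, assuming the ACLE svld1rq* builtins in <arm_sve.h> are lowered by the frontend to
llvm.aarch64.sve.ld1rq; the function name broadcast_quadword is made up for illustration.
LD1RQ loads 128 bits from the base address under predicate control and replicates them to
fill every 128-bit chunk of the scalable vector, which the patterns above select as
ld1rqb/h/w/d with an optional #imm offset.

  // Hypothetical example, not taken from this patch (compile with SVE enabled,
  // e.g. -march=armv8-a+sve).
  #include <arm_sve.h>

  // Loads four 32-bit lanes from base (per the predicate) and replicates them
  // across the whole scalable vector; with this patch it should select
  // "ld1rqw { z0.s }, p0/z, [x0]".
  svint32_t broadcast_quadword(svbool_t pg, const int32_t *base) {
    return svld1rq_s32(pg, base);
  }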