diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1305,6 +1305,7 @@
 //
 def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld1ro : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -248,6 +248,7 @@
   LD1,
   LD1S,
+  LD1RO,
   LDNF1,
   LDNF1S,
   LDFF1,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1458,6 +1458,7 @@
   case AArch64ISD::PTRUE:   return "AArch64ISD::PTRUE";
   case AArch64ISD::LD1:     return "AArch64ISD::LD1";
   case AArch64ISD::LD1S:    return "AArch64ISD::LD1S";
+  case AArch64ISD::LD1RO:   return "AArch64ISD::LD1RO";
   case AArch64ISD::LDNF1:   return "AArch64ISD::LDNF1";
   case AArch64ISD::LDNF1S:  return "AArch64ISD::LDNF1S";
   case AArch64ISD::LDFF1:   return "AArch64ISD::LDFF1";
@@ -11894,6 +11895,24 @@
   return DAG.getMergeValues({ Load, LoadChain }, DL);
 }
 
+static SDValue performLD1ROCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  EVT LoadVT = VT;
+  if (VT.isFloatingPoint())
+    LoadVT = VT.changeTypeToInteger();
+
+  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
+  SDValue Load = DAG.getNode(AArch64ISD::LD1RO, DL, {LoadVT, MVT::Other}, Ops);
+  SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+  if (VT.isFloatingPoint())
+    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
+
+  return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue Data = N->getOperand(2);
@@ -13485,6 +13504,8 @@
       return performLDNT1Combine(N, DAG);
     case Intrinsic::aarch64_sve_ld1rq:
       return performLD1RQCombine(N, DAG);
+    case Intrinsic::aarch64_sve_ld1ro:
+      return performLD1ROCombine(N, DAG);
     case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnt1_gather:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -31,12 +31,13 @@
 // Contiguous load and replicate - node definitions
 //
 
-def SDT_AArch64_LD1RQ : SDTypeProfile<1, 2, [
+def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
 ]>;
 
-def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1RQ, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1ro : SDNode<"AArch64ISD::LD1RO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
 
 // Gather loads - node definitions
 //
@@ -1914,10 +1915,10 @@
 let Predicates = [HasSVE, HasMatMulFP64] in {
   defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
-  defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8>;
-  defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16>;
-  defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32>;
-  defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64>;
+  defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro>;
+  defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro>;
+  defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro>;
+  defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro>;
   defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8>;
   defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16>;
   defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7663,7 +7663,7 @@
 }
 
 multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
-                           ZPRRegOp zprty> {
+                           ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
  def NAME : sve_mem_ldor_si<sz, asm, listty>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
                  (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
@@ -7671,6 +7671,11 @@
                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
+
+  // Base addressing mode
+  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
+            (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
+
 }
 
 class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s | FileCheck %s
+
+;
+; LD1ROB
+;
+
+define <vscale x 16 x i8> @ld1rob_i8(<vscale x 16 x i1> %pred, i8* %addr) nounwind {
+; CHECK-LABEL: ld1rob_i8:
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
+  ret <vscale x 16 x i8> %res
+}
+
+;
+; LD1ROH
+;
+
+define <vscale x 8 x i16> @ld1roh_i16(<vscale x 8 x i1> %pred, i16* %addr) nounwind {
+; CHECK-LABEL: ld1roh_i16:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1roh_half(<vscale x 8 x i1> %pred, half* %addr) nounwind {
+; CHECK-LABEL: ld1roh_half:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; LD1ROW
+;
+
+define <vscale x 4 x i32> @ld1row_i32(<vscale x 4 x i1> %pred, i32* %addr) nounwind {
+; CHECK-LABEL: ld1row_i32:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1row_float(<vscale x 4 x i1> %pred, float* %addr) nounwind {
+; CHECK-LABEL: ld1row_float:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
+  ret <vscale x 4 x float> %res
+}
+
+;
+; LD1ROD
+;
+
+define <vscale x 2 x i64> @ld1rod_i64(<vscale x 2 x i1> %pred, i64* %addr) nounwind {
+; CHECK-LABEL: ld1rod_i64:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rod_double(<vscale x 2 x i1> %pred, double* %addr) nounwind {
+; CHECK-LABEL: ld1rod_double:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
+  ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1>, double*)