Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1069,4 +1069,28 @@ def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic; def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic; + +// +// Gather loads: +// - scalar + vector +// - 64 bit unscaled offsets +// + +def int_aarch64_sve_ld1_gather : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>], + [IntrReadMem, IntrArgMemOnly]>; + +// +// Gather loads: +// - scalar + vector +// - 64 bit scaled offsets +// + +def int_aarch64_sve_ld1_gather_index : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>], + [IntrReadMem, IntrArgMemOnly]>; } Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -196,6 +196,10 @@ UUNPKHI, UUNPKLO, + // Unsigned gather loads. 
+ GLD1, + GLD1_SCALED, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -225,7 +229,6 @@ STZG, ST2G, STZ2G - }; } // end namespace AArch64ISD Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -614,6 +614,7 @@ setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); @@ -772,7 +773,7 @@ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { @@ -831,6 +832,7 @@ } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); + } void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { @@ -1333,6 +1335,8 @@ case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; + case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; + case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; } return nullptr; } @@ -2975,6 +2979,7 @@ return SDValue(); } + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -11747,6 +11752,90 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } +static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode) { + EVT RetVT = N->getValueType(0); + 
assert(RetVT.isScalableVector() && + "Gather loads are only possible for SVE vectors"); + + SDLoc DL(N); + MVT RetElVT = RetVT.getVectorElementType().getSimpleVT(); + unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits(); + + EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements); + if (RetVT.getSizeInBits().getKnownMinSize() > + MaxVT.getSizeInBits().getKnownMinSize()) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(3); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + const SDValue Offset = N->getOperand(4); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) || + !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Return value type that is representable in hardware + EVT HwRetVt = RetVT; + switch (RetVT.getVectorNumElements()) { + default: + return SDValue(); + case 16: + HwRetVt = MVT::nxv16i8; + break; + case 8: + HwRetVt = MVT::nxv8i16; + break; + case 4: + HwRetVt = MVT::nxv4i32; + break; + case 2: + HwRetVt = MVT::nxv2i64; + break; + } + + // Keep the original output value type around - this will better inform + // optimisations (e.g. instruction folding when load is followed by + // zext/sext). This will only be used for ints, so the value for FPs + // doesn't matter. 
+ SDValue OutVT = DAG.getValueType(RetVT); + if (RetVT.isFloatingPoint()) + OutVT = DAG.getValueType(HwRetVt); + + SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); + SDValue Ops[] = {N->getOperand(0), // Chain + N->getOperand(2), // Pg + Base, Offset, OutVT}; + + SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (RetVT.isInteger() && (RetVT != HwRetVt)) + Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); + + // If the original return value was FP, bitcast accordingly. Doing it here + // means that we can avoid adding TableGen patterns for FPs. + if (RetVT.isFloatingPoint()) { + EVT OutFpVT = RetVT; + switch (RetVT.getVectorNumElements()) { + default: + return SDValue(); + case 2: + OutFpVT = MVT::nxv2f64; + break; + case 4: + OutFpVT = MVT::nxv4f32; + break; + } + Load = DAG.getNode(ISD::BITCAST, DL, OutFpVT, Load.getValue(0)); + } + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -11833,6 +11922,10 @@ case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); + case Intrinsic::aarch64_sve_ld1_gather: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + case Intrinsic::aarch64_sve_ld1_gather_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); default: break; } Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -358,6 +358,16 @@ def am_indexedu6s128 : ComplexPattern; def am_indexeds9s128 : ComplexPattern; +def UImmS2XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def UImmS4XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64); +}]>; 
+def UImmS8XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64); +}]>; + // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; @@ -365,17 +375,20 @@ def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>; def uimm5s2 : Operand, ImmLeaf= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { let ParserMatchClass = UImm5s2Operand; let PrintMethod = "printImmScale<2>"; } def uimm5s4 : Operand, ImmLeaf= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { let ParserMatchClass = UImm5s4Operand; let PrintMethod = "printImmScale<4>"; } def uimm5s8 : Operand, ImmLeaf= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { let ParserMatchClass = UImm5s8Operand; let PrintMethod = "printImmScale<8>"; } Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -10,6 +10,21 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +def SVEAddrModeRegReg8 : 
ComplexPattern", []>; + let Predicates = [HasSVE] in { def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; @@ -454,33 +469,33 @@ // Gathers using unscaled 64-bit offsets, e.g. // ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", null_frag, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", null_frag, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", null_frag, nxv2i32>; + defm GLDFF1SW_D : 
sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. // ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : 
sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. // ld1h z0.d, p0/z, [x0, z0.d, uxtw] @@ -1136,6 +1151,13 @@ // 16-element contiguous stores defm : pred_store; + /* multiclass ldff1_gather { */ + /* // base + index */ + /* def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64sp:$base, GPR64:$offset), MemVT)), */ + /* (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>; */ + /* } */ + + /* defm : ldff1_gather; */ } let Predicates = [HasSVE2] in { Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5238,7 +5238,6 @@ (!cast(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; } - class sve_mem_32b_gld_vi opc, string asm, Operand imm_ty> : I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), asm, "\t$Zt, $Pg/z, [$Zn, $imm5]", @@ -5531,18 +5530,33 @@ } multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, - RegisterOperand zprext> { + SDPatternOperator op, + RegisterOperand zprext, ValueType vt> { def _SCALED_REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm> { +multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, + SDPatternOperator op, ValueType vt> { def 
_REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>; + } + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h =================================================================== --- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -643,6 +643,17 @@ }; } // end namespace AArch64II +namespace AArch64 { +// The number of bits in a SVE register is architecturally defined +// to be a multiple of this value. If <M x t> has this number of bits, +// a <n x M x t> vector can be stored in a SVE register without any +// redundant bits. If <M x t> has this number of bits divided by P, +// a <n x M x t> vector is stored in a SVE register by placing index i +// in index i*P of a <n x (M*P) x t> vector. The other elements of the +// <n x (M*P) x t> vector (such as index 1) are undefined. +const unsigned SVEBitsPerBlock = 128; +} // end namespace AArch64 + } // end namespace llvm #endif Index: llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-offset.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-offset.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1B, LD1W, LD1H, LD1D: base + 64-bit unscaled offset +; e.g. 
ld1h { z0.d }, p0/z, [x0, z0.d] +; + +define @gld1b_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_d: +; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gld1w_d: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = zext %load to + ret %res +} + +define @gld1d_d( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_d: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +declare @llvm.aarch64.sve.ld1.gather.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i64(, i64*, ) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-scaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-scaled.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1H, LD1W, LD1D: base + 64-bit scaled offset +; e.g. 
ld1h z0.d, p0/z, [x0, z0.d, lsl #1] +; + +define @gld1h_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_index +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_index +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1d_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_index +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_index_double +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, double*, )