diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -938,7 +938,6 @@
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
                               bool OverrideNEON = false) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -902,6 +902,7 @@
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::VECREDUCE_ADD);
+  setTargetDAGCombine(ISD::STEP_VECTOR);
 
   setTargetDAGCombine(ISD::GlobalAddress);
 
@@ -1151,7 +1152,6 @@
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
-      setOperationAction(ISD::STEP_VECTOR, VT, Custom);
 
       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
@@ -4476,8 +4476,6 @@
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::SPLAT_VECTOR:
     return LowerSPLAT_VECTOR(Op, DAG);
-  case ISD::STEP_VECTOR:
-    return LowerSTEP_VECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INSERT_SUBVECTOR:
@@ -9162,20 +9160,6 @@
   return GenerateTBL(Op, ShuffleMask, DAG);
 }
 
-SDValue AArch64TargetLowering::LowerSTEP_VECTOR(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-  assert(VT.isScalableVector() &&
-         "Only expect scalable vectors for STEP_VECTOR");
-  assert(VT.getScalarType() != MVT::i1 &&
-         "Vectors of i1 types not supported for STEP_VECTOR");
-
-  SDValue StepVal = Op.getOperand(0);
-  SDValue Zero = DAG.getConstant(0, dl, StepVal.getValueType());
-  return DAG.getNode(AArch64ISD::INDEX_VECTOR, dl, VT, Zero, StepVal);
-}
-
 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -9261,9 +9245,7 @@
   SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
 
   // create the vector 0,1,0,1,...
-  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
-  SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
-                           DL, MVT::nxv2i64, Zero, One);
+  SDValue SV = DAG.getNode(ISD::STEP_VECTOR, DL, MVT::nxv2i64, One);
   SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
 
   // create the vector idx64,idx64+1,idx64,idx64+1,...
@@ -13665,15 +13647,31 @@
   SDLoc DL(N);
   SDValue Op1 = N->getOperand(1);
   SDValue Op2 = N->getOperand(2);
-  EVT ScalarTy = Op1.getValueType();
-
-  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
-    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
-    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+  EVT ScalarTy = Op2.getValueType();
+  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+    ScalarTy = MVT::i32;
+
+  if (isa<ConstantSDNode>(Op2)) {
+    // Lower index_vector(base, step) to step_vector(step) + splat(base).
+    Op2 = DAG.getConstant((cast<ConstantSDNode>(Op2))
+                              ->getAPIntValue()
+                              .sextOrSelf(ScalarTy.getSizeInBits()),
+                          DL, ScalarTy);
+    SDValue StepVector =
+        DAG.getNode(ISD::STEP_VECTOR, DL, N->getValueType(0), Op2);
+    SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
+    return DAG.getNode(ISD::ADD, DL, N->getValueType(0), StepVector, Base);
+  } else {
+    // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
+    SDValue One = DAG.getConstant(1, DL, ScalarTy);
+    SDValue StepVector =
+        DAG.getNode(ISD::STEP_VECTOR, DL, N->getValueType(0), One);
+    SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
+    SDValue Mul =
+        DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
+    SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
+    return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
   }
-
-  return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
-                     Op1, Op2);
 }
 
 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
@@ -15463,6 +15461,19 @@
                      DAG.getConstant(MinOffset, DL, MVT::i64));
 }
 
+static SDValue performStepVectorCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (!DCI.isAfterLegalizeDAG())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue StepVal = N->getOperand(0);
+  SDValue Zero = DAG.getConstant(0, DL, StepVal.getValueType());
+  return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, VT, Zero, StepVal);
+}
+
 // Turns the vector of indices into a vector of byte offstes by scaling Offset
 // by (BitWidth / 8).
 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
@@ -15977,6 +15988,8 @@
     return performExtractVectorEltCombine(N, DAG);
   case ISD::VECREDUCE_ADD:
     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+  case ISD::STEP_VECTOR:
+    return performStepVectorCombine(N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1370,8 +1370,8 @@
   defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
   defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
 
-  defm INDEX_RR : sve_int_index_rr<"index", index_vector, index_vector_oneuse>;
-  defm INDEX_IR : sve_int_index_ir<"index", index_vector, index_vector_oneuse>;
+  defm INDEX_RR : sve_int_index_rr<"index", index_vector, index_vector_oneuse, AArch64mul_p_oneuse>;
+  defm INDEX_IR : sve_int_index_ir<"index", index_vector, index_vector_oneuse, AArch64mul_p, AArch64mul_p_oneuse>;
   defm INDEX_RI : sve_int_index_ri<"index", index_vector, index_vector_oneuse>;
   defm INDEX_II : sve_int_index_ii<"index", index_vector, index_vector_oneuse>;
 
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4822,7 +4822,7 @@
   let Inst{4-0} = Zd;
 }
 
-multiclass sve_int_index_ir<string asm, SDPatternOperator op, SDPatternOperator oneuseop> {
+multiclass sve_int_index_ir<string asm, SDPatternOperator op, SDPatternOperator oneuseop, SDPatternOperator mulop, SDPatternOperator muloneuseop> {
  def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
  def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
  def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
@@ -4847,6 +4847,25 @@
   def : Pat<(add (nxv2i64 (oneuseop (i64 0), GPR64:$Rm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
+  // mul(index_vector(0, 1), dup(Y)) -> index_vector(0, Y).
+  def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (oneuseop (i32 0), (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
+  def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (oneuseop (i32 0), (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
+  def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (oneuseop (i32 0), (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
+  def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (oneuseop (i64 0), (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
+            (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
+
+  // add(mul(index_vector(0, 1), dup(Y)), dup(X)) -> index_vector(X, Y).
+  def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (oneuseop (i32 0), (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+            (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
+  def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (oneuseop (i32 0), (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+            (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
+  def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (oneuseop (i32 0), (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+            (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
+  def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (oneuseop (i64 0), (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+            (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
 }
 
 class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4909,7 +4928,7 @@
   let Inst{4-0} = Zd;
 }
 
-multiclass sve_int_index_rr<string asm, SDPatternOperator op, SDPatternOperator oneuseop> {
+multiclass sve_int_index_rr<string asm, SDPatternOperator op, SDPatternOperator oneuseop, SDPatternOperator mulop> {
   def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
   def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
   def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
@@ -4929,6 +4948,16 @@
             (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
   def : Pat<(add (nxv2i64 (oneuseop (i64 0), GPR64:$Rm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
             (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
+
+  // add(mul(index_vector(0, 1), dup(Y)), dup(X)) -> index_vector(X, Y).
+  def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (oneuseop (i32 0), (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
+  def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (oneuseop (i32 0), (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
+  def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (oneuseop (i32 0), (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
+  def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (oneuseop (i64 0), (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+            (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ;
@@ -6,132 +7,175 @@
 
 define <vscale x 16 x i8> @index_ii_i8() {
 ; CHECK-LABEL: index_ii_i8:
-; CHECK: index z0.b, #-16, #15
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.b, #-16, #15
+; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i8> @llvm.aarch64.sve.index.nxv16i8(i8 -16, i8 15)
   ret <vscale x 16 x i8> %out
 }
 
 define <vscale x 8 x i16> @index_ii_i16() {
 ; CHECK-LABEL: index_ii_i16:
-; CHECK: index z0.h, #15, #-16
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, #15, #-16
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 15, i16 -16)
   ret <vscale x 8 x i16> %out
 }
 
 define <vscale x 4 x i32> @index_ii_i32() {
 ; CHECK-LABEL: index_ii_i32:
-; CHECK: index z0.s, #-16, #15
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, #-16, #15
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 -16, i32 15)
   ret <vscale x 4 x i32> %out
 }
 
 define <vscale x 2 x i64> @index_ii_i64() {
 ; CHECK-LABEL: index_ii_i64:
-; CHECK: index z0.d, #15, #-16
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, #15, #-16
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 15, i64 -16)
   ret <vscale x 2 x i64> %out
 }
 
 define <vscale x 2 x i64> @index_ii_range() {
 ; CHECK-LABEL: index_ii_range:
-; CHECK: mov w8, #16
-; CHECK-NEXT: mov x9, #-17
-; CHECK-NEXT: index z0.d, x9, x8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov x9, #-17
+; CHECK-NEXT:    index z0.d, x9, x8
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 -17, i64 16)
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 8 x i16> @index_ii_range_combine(i16 %a) {
+; CHECK-LABEL: index_ii_range_combine:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, #2, #8
+; CHECK-NEXT:    ret
+  %val = insertelement <vscale x 8 x i16> poison, i16 2, i32 0
+  %val1 = shufflevector <vscale x 8 x i16> %val, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %val2 = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 0, i16 2)
+  %val3 = shl <vscale x 8 x i16> %val2, %val1
+  %out = add <vscale x 8 x i16> %val3, %val1
+  ret <vscale x 8 x i16> %out
+}
+
 ;
 ; INDEX (IMMEDIATE, SCALAR)
 ;
 
 define <vscale x 16 x i8> @index_ir_i8(i8 %a) {
 ; CHECK-LABEL: index_ir_i8:
-; CHECK: index z0.b, #15, w0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.b, #15, w0
+; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i8> @llvm.aarch64.sve.index.nxv16i8(i8 15, i8 %a)
   ret <vscale x 16 x i8> %out
 }
 
 define <vscale x 8 x i16> @index_ir_i16(i16 %a) {
 ; CHECK-LABEL: index_ir_i16:
-; CHECK: index z0.h, #-16, w0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, #-16, w0
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 -16, i16 %a)
   ret <vscale x 8 x i16> %out
 }
 
 define <vscale x 4 x i32> @index_ir_i32(i32 %a) {
 ; CHECK-LABEL: index_ir_i32:
-; CHECK: index z0.s, #15, w0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, #15, w0
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 15, i32 %a)
   ret <vscale x 4 x i32> %out
 }
 
 define <vscale x 2 x i64> @index_ir_i64(i64 %a) {
 ; CHECK-LABEL: index_ir_i64:
-; CHECK: index z0.d, #-16, x0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, #-16, x0
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 -16, i64 %a)
   ret <vscale x 2 x i64> %out
 }
 
 define <vscale x 4 x i32> @index_ir_range(i32 %a) {
 ; CHECK-LABEL: index_ir_range:
-; CHECK: mov w8, #-17
-; CHECK: index z0.s, w8, w0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-17
+; CHECK-NEXT:    index z0.s, w8, w0
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 -17, i32 %a)
   ret <vscale x 4 x i32> %out
 }
 
+define <vscale x 4 x i32> @index_ir_range_combine(i32 %a) {
+; CHECK-LABEL: index_ir_range_combine:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, #0, w0
+; CHECK-NEXT:    ret
+  %val = insertelement <vscale x 4 x i32> poison, i32 2, i32 0
+  %val1 = shufflevector <vscale x 4 x i32> %val, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 2, i32 1)
+  %tmp1 = sub <vscale x 4 x i32> %tmp, %val1
+  %val2 = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %val3 = shufflevector <vscale x 4 x i32> %val2, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %out = mul <vscale x 4 x i32> %tmp1, %val3
+  ret <vscale x 4 x i32> %out
+}
+
 ;
 ; INDEX (SCALAR, IMMEDIATE)
 ;
 
 define <vscale x 16 x i8> @index_ri_i8(i8 %a) {
 ; CHECK-LABEL: index_ri_i8:
-; CHECK: index z0.b, w0, #-16
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.b, w0, #-16
+; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i8> @llvm.aarch64.sve.index.nxv16i8(i8 %a, i8 -16)
   ret <vscale x 16 x i8> %out
 }
 
 define <vscale x 8 x i16> @index_ri_i16(i16 %a) {
 ; CHECK-LABEL: index_ri_i16:
-; CHECK: index z0.h, w0, #15
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, w0, #15
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 15)
   ret <vscale x 8 x i16> %out
 }
 
 define <vscale x 4 x i32> @index_ri_i32(i32 %a) {
 ; CHECK-LABEL: index_ri_i32:
-; CHECK: index z0.s, w0, #-16
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, w0, #-16
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 %a, i32 -16)
   ret <vscale x 4 x i32> %out
 }
 
 define <vscale x 2 x i64> @index_ri_i64(i64 %a) {
 ; CHECK-LABEL: index_ri_i64:
-; CHECK: index z0.d, x0, #15
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, x0, #15
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %a, i64 15)
   ret <vscale x 2 x i64> %out
 }
 
 define <vscale x 8 x i16> @index_ri_range(i16 %a) {
 ; CHECK-LABEL: index_ri_range:
-; CHECK: mov w8, #16
-; CHECK: index z0.h, w0, w8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    index z0.h, w0, w8
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 16)
   ret <vscale x 8 x i16> %out
 }
@@ -142,36 +186,76 @@
 
 define <vscale x 16 x i8> @index_rr_i8(i8 %a, i8 %b) {
 ; CHECK-LABEL: index_rr_i8:
-; CHECK: index z0.b, w0, w1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.b, w0, w1
+; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i8> @llvm.aarch64.sve.index.nxv16i8(i8 %a, i8 %b)
   ret <vscale x 16 x i8> %out
 }
 
 define <vscale x 8 x i16> @index_rr_i16(i16 %a, i16 %b) {
 ; CHECK-LABEL: index_rr_i16:
-; CHECK: index z0.h, w0, w1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, w0, w1
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 %b)
   ret <vscale x 8 x i16> %out
 }
 
 define <vscale x 4 x i32> @index_rr_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: index_rr_i32:
-; CHECK: index z0.s, w0, w1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, w0, w1
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 %a, i32 %b)
   ret <vscale x 4 x i32> %out
 }
 
 define <vscale x 2 x i64> @index_rr_i64(i64 %a, i64 %b) {
 ; CHECK-LABEL: index_rr_i64:
-; CHECK: index z0.d, x0, x1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, x0, x1
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %a, i64 %b)
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 4 x i32> @index_rr_i32_combine(i32 %a, i32 %b) {
+; CHECK-LABEL: index_rr_i32_combine:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, w0, w1
+; CHECK-NEXT:    ret
+  %val = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %val1 = shufflevector <vscale x 4 x i32> %val, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %val2 = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %val3 = shufflevector <vscale x 4 x i32> %val2, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 0, i32 1)
+  %tmp1 = mul <vscale x 4 x i32> %tmp, %val3
+  %out = add <vscale x 4 x i32> %tmp1, %val1
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @index_rr_i32_not_combine(i32 %a, i32 %b) {
+; CHECK-LABEL: index_rr_i32_not_combine:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    mov z1.s, w1
+; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mla z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    ret
+  %val = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %val1 = shufflevector <vscale x 4 x i32> %val, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %val2 = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %val3 = shufflevector <vscale x 4 x i32> %val2, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 0, i32 1)
+  %tmp1 = mul <vscale x 4 x i32> %tmp, %val3
+  %tmp2 = add <vscale x 4 x i32> %tmp1, %val1
+  %out = add <vscale x 4 x i32> %tmp2, %tmp
+  ret <vscale x 4 x i32> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.index.nxv16i8(i8, i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16, i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32, i32)