diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -601,15 +601,12 @@
 
   /// STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised
   /// of a linear sequence of unsigned values starting from 0 with a step of
-  /// IMM, where IMM must be a vector index constant integer value which must
-  /// fit in the vector element type.
-  /// Note that IMM may be a smaller type than the vector element type, in
-  /// which case the step is implicitly sign-extended to the vector element
-  /// type. IMM may also be a larger type than the vector element type, in
-  /// which case the step is implicitly truncated to the vector element type.
+  /// IMM, where IMM must be a TargetConstant with type equal to the vector
+  /// element type. The arithmetic is performed modulo 2^N, where N is the
+  /// bit width of the element.
+  ///
   /// The operation does not support returning fixed-width vectors or
-  /// non-constant operands. If the sequence value exceeds the limit allowed
-  /// for the element type then the values for those lanes are undefined.
+  /// taking non-constant operands.
   STEP_VECTOR,
 
   /// MULHU/MULHS - Multiply high - Multiply two integers of type iN,
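
A note on the tightened semantics: the sequence now wraps. A minimal C++
sketch of the wrap-around, with illustrative values not taken from the patch
(lane count and step assumed):

    // Lane I of STEP_VECTOR(100) for an i8 element type; APInt
    // multiplication wraps modulo 2^BitWidth, matching the rule above.
    llvm::APInt Step(/*numBits=*/8, /*val=*/100);
    for (unsigned I = 0; I < 4; ++I) {
      llvm::APInt Lane = Step * I; // lanes: 0, 100, 200, 44 (300 mod 256)
    }
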
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -835,7 +835,7 @@
 
   /// Returns a vector of type ResVT whose elements contain the linear sequence
   ///   <0, Step, Step * 2, Step * 3, ...>
-  SDValue getStepVector(const SDLoc &DL, EVT ResVT, SDValue Step);
+  SDValue getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal);
 
   /// Returns a vector of type ResVT whose elements contain the linear sequence
   ///   <0, 1, 2, 3, ...>
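
Callers now pass the step as an APInt whose width must already match the
result's element type. A hypothetical use, assuming DAG, DL and Ctx are in
scope:

    // Build <0, 5, 10, 15, ...> as an nxv4i32 step vector.
    EVT VT = EVT::getVectorVT(Ctx, MVT::i32, ElementCount::getScalable(4));
    SDValue SV = DAG.getStepVector(DL, VT, APInt(/*numBits=*/32, /*val=*/5));
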
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2526,8 +2526,7 @@
       N1.getOpcode() == ISD::STEP_VECTOR) {
     const APInt &C0 = N0->getConstantOperandAPInt(0);
     const APInt &C1 = N1->getConstantOperandAPInt(0);
-    EVT SVT = N0.getOperand(0).getValueType();
-    SDValue NewStep = DAG.getConstant(C0 + C1, DL, SVT);
+    APInt NewStep = C0 + C1;
     return DAG.getStepVector(DL, VT, NewStep);
   }
 
@@ -2537,11 +2536,7 @@
       (N1.getOpcode() == ISD::STEP_VECTOR)) {
     const APInt &SV0 = N0.getOperand(1)->getConstantOperandAPInt(0);
     const APInt &SV1 = N1->getConstantOperandAPInt(0);
-    EVT SVT = N1.getOperand(0).getValueType();
-    assert(N1.getOperand(0).getValueType() ==
-               N0.getOperand(1)->getOperand(0).getValueType() &&
-           "Different operand types of STEP_VECTOR.");
-    SDValue NewStep = DAG.getConstant(SV0 + SV1, DL, SVT);
+    APInt NewStep = SV0 + SV1;
     SDValue SV = DAG.getStepVector(DL, VT, NewStep);
     return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV);
   }
@@ -3576,8 +3571,7 @@
 
   // canonicalize (sub X, step_vector(C)) to (add X, step_vector(-C))
   if (N1.getOpcode() == ISD::STEP_VECTOR && N1.hasOneUse()) {
-    SDValue NewStep = DAG.getConstant(-N1.getConstantOperandAPInt(0), DL,
-                                      N1.getOperand(0).getValueType());
+    APInt NewStep = -N1.getConstantOperandAPInt(0);
     return DAG.getNode(ISD::ADD, DL, VT, N0,
                        DAG.getStepVector(DL, VT, NewStep));
   }
@@ -3961,9 +3955,7 @@
   if (N0.getOpcode() == ISD::STEP_VECTOR)
     if (ISD::isConstantSplatVector(N1.getNode(), MulVal)) {
       const APInt &C0 = N0.getConstantOperandAPInt(0);
-      EVT SVT = N0.getOperand(0).getValueType();
-      SDValue NewStep = DAG.getConstant(
-          C0 * MulVal.sextOrTrunc(SVT.getSizeInBits()), SDLoc(N), SVT);
+      APInt NewStep = C0 * MulVal;
       return DAG.getStepVector(SDLoc(N), VT, NewStep);
     }
 
@@ -8476,10 +8468,10 @@
   if (N0.getOpcode() == ISD::STEP_VECTOR)
     if (ISD::isConstantSplatVector(N1.getNode(), ShlVal)) {
       const APInt &C0 = N0.getConstantOperandAPInt(0);
-      EVT SVT = N0.getOperand(0).getValueType();
-      SDValue NewStep = DAG.getConstant(
-          C0 << ShlVal.sextOrTrunc(SVT.getSizeInBits()), SDLoc(N), SVT);
-      return DAG.getStepVector(SDLoc(N), VT, NewStep);
+      if (ShlVal.ult(C0.getBitWidth())) {
+        APInt NewStep = C0 << ShlVal;
+        return DAG.getStepVector(SDLoc(N), VT, NewStep);
+      }
     }
 
   return SDValue();
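
The ult() guard in the shl combine is the one behavioural addition here:
shifting a step by its full bit width or more cannot produce a usable step,
so the fold is now skipped. A sketch of the check in isolation, with assumed
values:

    // An i8 step shifted by 9 bits has no defined result, so no fold.
    APInt C0(8, 3), ShlVal(8, 9);
    if (ShlVal.ult(C0.getBitWidth())) { // false here: 9 >= 8
      APInt NewStep = C0 << ShlVal;     // only reached for in-range shifts
    }
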
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4861,11 +4861,9 @@
   EVT OutVT = N->getValueType(0);
   EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
   assert(NOutVT.isVector() && "Type must be promoted to a vector type");
-  EVT NOutElemVT = TLI.getTypeToTransformTo(*DAG.getContext(),
-                                            NOutVT.getVectorElementType());
   APInt StepVal = cast<ConstantSDNode>(N->getOperand(0))->getAPIntValue();
-  SDValue Step = DAG.getConstant(StepVal.getSExtValue(), dl, NOutElemVT);
-  return DAG.getStepVector(dl, NOutVT, Step);
+  return DAG.getStepVector(dl, NOutVT,
+                           StepVal.sext(NOutVT.getScalarSizeInBits()));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
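
The promotion uses sext rather than zext because the step is a signed
quantity. An illustrative example (assumed types, not from the patch):

    // An i8 step of -1 promoted to an i32 element type keeps the
    // descending sequence 0, -1, -2, ... rather than 0, 255, 510, ...
    APInt StepVal(8, -1, /*isSigned=*/true);
    APInt Promoted = StepVal.sext(32); // 0xffffffff, i.e. still -1
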
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1748,19 +1748,21 @@
 }
 
 SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
-  EVT OpVT = TLI->getTypeToTransformTo(*getContext(), ResVT.getScalarType());
-  return getStepVector(DL, ResVT, getConstant(1, DL, OpVT));
+  APInt One(ResVT.getScalarSizeInBits(), 1);
+  return getStepVector(DL, ResVT, One);
 }
 
-SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT, SDValue Step) {
+SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal) {
+  assert(ResVT.getScalarSizeInBits() == StepVal.getBitWidth() &&
+         "Step value bit width must match the result's element width");
   if (ResVT.isScalableVector())
-    return getNode(ISD::STEP_VECTOR, DL, ResVT, Step);
+    return getNode(
+        ISD::STEP_VECTOR, DL, ResVT,
+        getTargetConstant(StepVal, DL, ResVT.getVectorElementType()));
 
-  EVT OpVT = Step.getValueType();
-  APInt StepVal = cast<ConstantSDNode>(Step)->getAPIntValue();
   SmallVector<SDValue, 16> OpsStepConstants;
   for (uint64_t i = 0; i < ResVT.getVectorNumElements(); i++)
-    OpsStepConstants.push_back(getConstant(StepVal * i, DL, OpVT));
+    OpsStepConstants.push_back(
+        getConstant(StepVal * i, DL, ResVT.getVectorElementType()));
   return getBuildVector(ResVT, DL, OpsStepConstants);
 }
 
@@ -4780,14 +4782,9 @@
   case ISD::STEP_VECTOR:
     assert(VT.isScalableVector() &&
            "STEP_VECTOR can only be used with scalable types");
-    assert(VT.getScalarSizeInBits() >= 8 &&
-           "STEP_VECTOR can only be used with vectors of integers that are at "
-           "least 8 bits wide");
-    assert(isa<ConstantSDNode>(Operand) &&
-           cast<ConstantSDNode>(Operand)->getAPIntValue().isSignedIntN(
-               VT.getScalarSizeInBits()) &&
-           "Expected STEP_VECTOR integer constant to fit in "
-           "the vector element type");
+    assert(OpOpcode == ISD::TargetConstant &&
+           VT.getVectorElementType() == Operand.getValueType() &&
+           "Unexpected step operand");
     break;
   case ISD::FREEZE:
     assert(VT == Operand.getValueType() && "Unexpected VT!");
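
For fixed-width results getStepVector still expands to a BUILD_VECTOR of
step multiples; conceptually, with assumed values:

    // v4i16 with step 3 yields <0, 3, 6, 9>.
    APInt Step(16, 3);
    SmallVector<APInt, 4> Lanes;
    for (uint64_t I = 0; I < 4; ++I)
      Lanes.push_back(Step * I); // 0, 3, 6, 9
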
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4835,6 +4835,14 @@
 //SVE Index Generation Group
 //===----------------------------------------------------------------------===//
 
+def simm5_8b_tgt : TImmLeaf<i8, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]>;
+def simm5_16b_tgt : TImmLeaf<i16, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]>;
+def simm5_32b_tgt : TImmLeaf<i32, [{ return (int32_t)Imm >= -16 && (int32_t)Imm < 16; }]>;
+def simm5_64b_tgt : TImmLeaf<i64, [{ return (int64_t)Imm >= -16 && (int64_t)Imm < 16; }]>;
+def i64imm_32bit_tgt : TImmLeaf<i64, [{
+  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
 class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
                        Operand imm_ty>
 : I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
@@ -4858,23 +4866,23 @@
   def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
   def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
 
-  def : Pat<(nxv16i8 (step_vector simm5_8b:$imm5b)),
-            (!cast<Instruction>(NAME # "_B") (i32 0), simm5_8b:$imm5b)>;
-  def : Pat<(nxv8i16 (step_vector simm5_16b:$imm5b)),
-            (!cast<Instruction>(NAME # "_H") (i32 0), simm5_16b:$imm5b)>;
-  def : Pat<(nxv4i32 (step_vector simm5_32b:$imm5b)),
+  def : Pat<(nxv16i8 (step_vector simm5_8b_tgt:$imm5b)),
+            (!cast<Instruction>(NAME # "_B") (i32 0), (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+  def : Pat<(nxv8i16 (step_vector simm5_16b_tgt:$imm5b)),
+            (!cast<Instruction>(NAME # "_H") (i32 0), (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+  def : Pat<(nxv4i32 (step_vector simm5_32b_tgt:$imm5b)),
             (!cast<Instruction>(NAME # "_S") (i32 0), simm5_32b:$imm5b)>;
-  def : Pat<(nxv2i64 (step_vector simm5_64b:$imm5b)),
+  def : Pat<(nxv2i64 (step_vector simm5_64b_tgt:$imm5b)),
             (!cast<Instruction>(NAME # "_D") (i64 0), simm5_64b:$imm5b)>;
 
   // add(step_vector(step), dup(X)) -> index(X, step).
-  def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
-            (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
-  def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
-            (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
-  def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+  def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+            (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+  def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+            (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
+  def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
             (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+  def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
 }
 
@@ -4901,33 +4909,33 @@
   def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
   def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
 
-  def : Pat<(nxv16i8 (step_vector (i32 imm:$imm))),
-            (!cast<Instruction>(NAME # "_B") (i32 0), (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(nxv8i16 (step_vector (i32 imm:$imm))),
-            (!cast<Instruction>(NAME # "_H") (i32 0), (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(nxv4i32 (step_vector (i32 imm:$imm))),
-            (!cast<Instruction>(NAME # "_S") (i32 0), (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(nxv2i64 (step_vector (i64 imm:$imm))),
-            (!cast<Instruction>(NAME # "_D") (i64 0), (!cast<Instruction>("MOVi64imm") imm:$imm))>;
-  def : Pat<(nxv2i64 (step_vector (i64 !cast<ImmLeaf>("i64imm_32bit"):$imm))),
-            (!cast<Instruction>(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") imm:$imm)), sub_32))>;
+  def : Pat<(nxv16i8 (step_vector i8:$imm)),
+            (!cast<Instruction>(NAME # "_B") (i32 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(nxv8i16 (step_vector i16:$imm)),
+            (!cast<Instruction>(NAME # "_H") (i32 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(nxv4i32 (step_vector i32:$imm)),
+            (!cast<Instruction>(NAME # "_S") (i32 0), (!cast<Instruction>("MOVi32imm") $imm))>;
+  def : Pat<(nxv2i64 (step_vector i64:$imm)),
+            (!cast<Instruction>(NAME # "_D") (i64 0), (!cast<Instruction>("MOVi64imm") $imm))>;
+  def : Pat<(nxv2i64 (step_vector i64imm_32bit_tgt:$imm)),
+            (!cast<Instruction>(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // add(step_vector(step), dup(X)) -> index(X, step).
-  def : Pat<(add (nxv16i8 (step_vector_oneuse (i32 imm:$imm))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
-            (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv8i16 (step_vector_oneuse (i32 imm:$imm))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
-            (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv4i32 (step_vector_oneuse (i32 imm:$imm))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
-            (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse (i64 imm:$imm))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
-            (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (!cast<Instruction>("MOVi64imm") imm:$imm))>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse (i64 !cast<ImmLeaf>("i64imm_32bit"):$imm))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
-            (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") imm:$imm)), sub_32))>;
+  def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+            (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+            (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+            (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, (!cast<Instruction>("MOVi32imm") $imm))>;
+  def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+            (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (!cast<Instruction>("MOVi64imm") $imm))>;
+  def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+            (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // mul(step_vector(1), dup(Y)) -> index(0, Y).
-  def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
   def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
@@ -4935,9 +4943,9 @@
             (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
             (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
             (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
   def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
             (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
@@ -4969,13 +4977,13 @@
   def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
 
   // add(step_vector(step), dup(X)) -> index(X, step).
-  def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
-            (!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
-  def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
-            (!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
-  def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
+  def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "_B") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
+  def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "_H") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
+  def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
+  def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
             (!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
 }
 
@@ -5003,21 +5011,21 @@
   def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
 
   // add(step_vector(step), dup(X)) -> index(X, step).
-  def : Pat<(add (nxv16i8 (step_vector_oneuse (i32 imm:$imm))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
-            (!cast<Instruction>(NAME # "_B") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv8i16 (step_vector_oneuse (i32 imm:$imm))), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
-            (!cast<Instruction>(NAME # "_H") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv4i32 (step_vector_oneuse (i32 imm:$imm))), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
-            (!cast<Instruction>(NAME # "_S") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") imm:$imm))>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse (i64 imm:$imm))), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
-            (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (!cast<Instruction>("MOVi64imm") imm:$imm))>;
-  def : Pat<(add (nxv2i64 (step_vector_oneuse (i64 !cast<ImmLeaf>("i64imm_32bit"):$imm))), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
-            (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") imm:$imm)), sub_32))>;
+  def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_B") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_H") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
+  def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+            (!cast<Instruction>(NAME # "_S") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") $imm))>;
+  def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+            (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (!cast<Instruction>("MOVi64imm") $imm))>;
+  def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+            (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i32 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i32 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
   def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4070,16 +4070,15 @@
   SDValue StepVec = DAG.getNode(RISCVISD::VID_VL, DL, VT, Mask, VL);
   uint64_t StepValImm = Op.getConstantOperandVal(0);
   if (StepValImm != 1) {
-    assert(Op.getOperand(0).getValueType() == XLenVT &&
-           "Unexpected step value type");
     if (isPowerOf2_64(StepValImm)) {
       SDValue StepVal =
           DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
                       DAG.getConstant(Log2_64(StepValImm), DL, XLenVT));
       StepVec = DAG.getNode(ISD::SHL, DL, VT, StepVec, StepVal);
     } else {
-      SDValue StepVal =
-          DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Op.getOperand(0));
+      SDValue StepVal = lowerScalarSplat(
+          DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), VL, VT,
+          DL, DAG, Subtarget);
       StepVec = DAG.getNode(ISD::MUL, DL, VT, StepVec, StepVal);
     }
   }
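
The RISC-V lowering now dispatches on the step value itself: a power-of-two
step becomes a vector shift, anything else a splat plus multiply, with
lowerScalarSplat letting an i64 step survive on RV32 (no single GPR can hold
it there, hence the stack stores in the new tests). A sketch of the dispatch,
values assumed:

    uint64_t StepValImm = 33333333333ULL; // the big immediate in the tests
    if (isPowerOf2_64(StepValImm)) {
      // vid.v, then vsll.vi by Log2_64(StepValImm)
    } else {
      // splat the constant (through the stack on RV32), then vmul.vv
    }
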
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
 
 declare <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
 
@@ -431,6 +431,43 @@
   ret <vscale x 8 x i64> %3
 }
 
+define <vscale x 8 x i64> @mul_bigimm_stepvector_nxv8i64() {
+; RV32-LABEL: mul_bigimm_stepvector_nxv8i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi a0, zero, 7
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    lui a0, 797989
+; RV32-NEXT:    addi a0, a0, -683
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    vid.v v16
+; RV32-NEXT:    vmul.vv v8, v16, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mul_bigimm_stepvector_nxv8i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    lui a0, 1987
+; RV64-NEXT:    addiw a0, a0, -731
+; RV64-NEXT:    slli a0, a0, 12
+; RV64-NEXT:    addi a0, a0, -683
+; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    ret
+entry:
+  %0 = insertelement <vscale x 8 x i64> poison, i64 33333333333, i32 0
+  %1 = shufflevector <vscale x 8 x i64> %0, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %2 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+  %3 = mul <vscale x 8 x i64> %2, %1
+  ret <vscale x 8 x i64> %3
+}
+
 define <vscale x 8 x i64> @shl_stepvector_nxv8i64() {
 ; CHECK-LABEL: shl_stepvector_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
@@ -449,27 +486,59 @@
 declare <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
 
 define <vscale x 16 x i64> @stepvector_nxv16i64() {
-; CHECK-LABEL: stepvector_nxv16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vx v16, v8, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: stepvector_nxv16i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vadd.vv v16, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: stepvector_nxv16i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    vadd.vx v16, v8, a0
+; RV64-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
   ret <vscale x 16 x i64> %v
 }
 
 define <vscale x 16 x i64> @add_stepvector_nxv16i64() {
-; CHECK-LABEL: add_stepvector_nxv16i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsll.vi v8, v8, 1
-; CHECK-NEXT:    vadd.vx v16, v8, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: add_stepvector_nxv16i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsll.vi v8, v8, 1
+; RV32-NEXT:    vadd.vv v16, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_stepvector_nxv16i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    vsll.vi v8, v8, 1
+; RV64-NEXT:    vadd.vx v16, v8, a0
+; RV64-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
   %1 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
@@ -478,17 +547,39 @@
 }
 
 define <vscale x 16 x i64> @mul_stepvector_nxv16i64() {
-; CHECK-LABEL: mul_stepvector_nxv16i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    addi a0, zero, 3
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vadd.vx v16, v8, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: mul_stepvector_nxv16i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    srli a1, a0, 3
+; RV32-NEXT:    addi a2, zero, 24
+; RV32-NEXT:    mulhu a1, a1, a2
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    addi a0, zero, 3
+; RV32-NEXT:    vmul.vx v8, v8, a0
+; RV32-NEXT:    vadd.vv v16, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mul_stepvector_nxv16i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    addi a0, zero, 3
+; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    vadd.vx v16, v8, a0
+; RV64-NEXT:    ret
 entry:
   %0 = insertelement <vscale x 16 x i64> poison, i64 3, i32 0
   %1 = shufflevector <vscale x 16 x i64> %0, <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
@@ -497,16 +588,88 @@
   ret <vscale x 16 x i64> %3
 }
 
+define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
+; RV32-LABEL: mul_bigimm_stepvector_nxv16i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi a0, zero, 7
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    lui a0, 797989
+; RV32-NEXT:    addi a0, a0, -683
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    lui a1, 11557
+; RV32-NEXT:    addi a1, a1, -683
+; RV32-NEXT:    mul a1, a0, a1
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    srli a0, a0, 3
+; RV32-NEXT:    addi a1, zero, 62
+; RV32-NEXT:    mul a1, a0, a1
+; RV32-NEXT:    lui a2, 92455
+; RV32-NEXT:    addi a2, a2, -1368
+; RV32-NEXT:    mulhu a0, a0, a2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vid.v v24
+; RV32-NEXT:    vmul.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v16, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mul_bigimm_stepvector_nxv16i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    lui a1, 1987
+; RV64-NEXT:    addiw a1, a1, -731
+; RV64-NEXT:    slli a1, a1, 12
+; RV64-NEXT:    addi a1, a1, -683
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vadd.vx v16, v8, a0
+; RV64-NEXT:    ret
+entry:
+  %0 = insertelement <vscale x 16 x i64> poison, i64 33333333333, i32 0
+  %1 = shufflevector <vscale x 16 x i64> %0, <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+  %2 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+  %3 = mul <vscale x 16 x i64> %2, %1
+  ret <vscale x 16 x i64> %3
+}
+
 define <vscale x 16 x i64> @shl_stepvector_nxv16i64() {
-; CHECK-LABEL: shl_stepvector_nxv16i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vadd.vx v16, v8, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: shl_stepvector_nxv16i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vadd.vv v16, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: shl_stepvector_nxv16i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    vsll.vi v8, v8, 2
+; RV64-NEXT:    vadd.vx v16, v8, a0
+; RV64-NEXT:    ret
 entry:
   %0 = insertelement <vscale x 16 x i64> poison, i64 2, i32 0
   %1 = shufflevector <vscale x 16 x i64> %0, <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer