diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2464,33 +2464,66 @@
   let hasVerifier = 1;
 }
 
+def PrintPunctuation : I32EnumAttr<"PrintPunctuation",
+                                  "Punctuation for separating vectors or vector elements", [
+  I32EnumAttrCase<"NoPunctuation", 0, "no_punctuation">, // Print nothing.
+  I32EnumAttrCase<"NewLine", 1, "newline">, // Print a newline (op default).
+  I32EnumAttrCase<"Comma", 2, "comma">, // Print a comma.
+  I32EnumAttrCase<"Open", 3, "open">, // Print an opening bracket.
+  I32EnumAttrCase<"Close", 4, "close"> // Print a closing bracket.
+]> {
+  let cppNamespace = "::mlir::vector";
+  let genSpecializedAttr = 0;
+}
+
+def Vector_PrintPunctuation : EnumAttr<Vector_Dialect, PrintPunctuation, "punctuation"> {
+  let assemblyFormat = "`<` $value `>`"; // Written as e.g. `<comma>`.
+}
+
 def Vector_PrintOp :
   Vector_Op<"print", []>,
-  Arguments<(ins Type<Or<[
+  Arguments<(ins Optional<Type<Or<[
     AnyVectorOfAnyRank.predicate,
     AnyInteger.predicate, Index.predicate, AnyFloat.predicate
-  ]>>:$source)> {
+  ]>>>:$source, DefaultValuedAttr<Vector_PrintPunctuation,
+                      "::mlir::vector::PrintPunctuation::NewLine">:$punctuation)
+  > {
   let summary = "print operation (for testing and debugging)";
   let description = [{
-    Prints the source vector (or scalar) to stdout in human readable
-    format (for testing and debugging). No return value.
+    Prints the source vector (or scalar) to stdout in a human-readable format
+    (for testing and debugging). No return value.
 
     Example:
 
     ```mlir
-    %0 = arith.constant 0.0 : f32
-    %1 = vector.broadcast %0 : f32 to vector<4xf32>
-    vector.print %1 : vector<4xf32>
+    %v = arith.constant dense<0.0> : vector<4xf32>
+    vector.print %v : vector<4xf32>
+    ```
 
-    when lowered to LLVM, the vector print is unrolled into
-    elementary printing method calls that at runtime will yield
+    When lowered to LLVM, the vector print is decomposed into elementary
+    printing method calls that at runtime will yield:
 
+    ```
     ( 0.0, 0.0, 0.0, 0.0 )
+    ```
+
+    This is printed to stdout via a small runtime support library, which only
+    needs to provide a few printing methods (single value for all data
+    types, opening/closing bracket, comma, newline).
+
+    By default `vector.print` adds a newline after the vector, but this can be
+    controlled by the `punctuation` attribute. For example, to print a comma
+    after the vector instead, use:
 
-    on stdout when linked with a small runtime support library,
-    which only needs to provide a few printing methods (single
-    value for all data types, opening/closing bracket, comma,
-    newline).
+    ```mlir
+    vector.print %v : vector<4xf32> punctuation <comma>
+    ```
+
+    Note that it is possible to use the punctuation attribute alone. The
+    following will print a single newline:
+
+    ```mlir
+    vector.print punctuation <newline>
     ```
   }];
   let extraClassDeclaration = [{
@@ -2498,7 +2531,13 @@
       return getSource().getType();
     }
   }];
-  let assemblyFormat = "$source attr-dict `:` type($source)";
+  let builders = [
+    OpBuilder<(ins "PrintPunctuation":$punctuation), [{
+      build($_builder, $_state, {}, punctuation);
+    }]>,
+  ];
+
+  let assemblyFormat = "($source^ `:` type($source))? (`punctuation` $punctuation^)? attr-dict";
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -28,13 +28,6 @@
 using namespace mlir;
 using namespace mlir::vector;
 
-// Helper to reduce vector type by one rank at front.
-static VectorType reducedVectorTypeFront(VectorType tp) {
-  assert((tp.getRank() > 1) && "unlowerable vector type");
-  return VectorType::get(tp.getShape().drop_front(), tp.getElementType(),
-                         tp.getScalableDims().drop_front());
-}
-
 // Helper to reduce vector type by *all* but one rank at back.
 static VectorType reducedVectorTypeBack(VectorType tp) {
   assert((tp.getRank() > 1) && "unlowerable vector type");
@@ -1416,45 +1409,92 @@
 public:
   using ConvertOpToLLVMPattern<vector::PrintOp>::ConvertOpToLLVMPattern;
 
-  // Proof-of-concept lowering implementation that relies on a small
-  // runtime support library, which only needs to provide a few
-  // printing methods (single value for all data types, opening/closing
-  // bracket, comma, newline). The lowering fully unrolls a vector
-  // in terms of these elementary printing operations. The advantage
-  // of this approach is that the library can remain unaware of all
-  // low-level implementation details of vectors while still supporting
-  // output of any shaped and dimensioned vector. Due to full unrolling,
-  // this approach is less suited for very large vectors though.
+  // Lowering implementation that relies on a small runtime support library,
+  // which only needs to provide a few printing methods (single value for all
+  // data types, opening/closing bracket, comma, newline). The lowering splits
+  // the vector into elementary printing operations. The advantage of this
+  // approach is that the library can remain unaware of all low-level
+  // implementation details of vectors while still supporting output of any
+  // shaped and dimensioned vector.
+  //
+  // Note: This lowering only handles scalars; n-D vectors are broken into
+  // loops of scalar prints by VectorToSCF.
   //
   // TODO: rely solely on libc in future? something else?
   //
   LogicalResult
   matchAndRewrite(vector::PrintOp printOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type printType = printOp.getPrintType();
+    auto parent = printOp->getParentOfType<ModuleOp>();
+    if (!parent)
+      return failure();
+
+    auto loc = printOp->getLoc();
+
+    if (auto value = adaptor.getSource()) {
+      Type printType = printOp.getPrintType();
+      if (isa<VectorType>(printType)) {
+        // Vectors should be broken into elementary print ops in VectorToSCF.
+        return failure();
+      }
+      if (failed(emitScalarPrint(rewriter, parent, loc, printType, value)))
+        return failure();
+    }
+    // Emit the requested punctuation (if any) after the optional scalar.
+    auto punct = printOp.getPunctuation();
+    if (punct != PrintPunctuation::NoPunctuation) {
+      emitCall(rewriter, loc, [&] {
+        switch (punct) {
+        case PrintPunctuation::Close:
+          return LLVM::lookupOrCreatePrintCloseFn(parent);
+        case PrintPunctuation::Open:
+          return LLVM::lookupOrCreatePrintOpenFn(parent);
+        case PrintPunctuation::Comma:
+          return LLVM::lookupOrCreatePrintCommaFn(parent);
+        case PrintPunctuation::NewLine:
+          return LLVM::lookupOrCreatePrintNewlineFn(parent);
+        default:
+          llvm_unreachable("unexpected punctuation");
+        }
+      }());
+    }
 
+    rewriter.eraseOp(printOp);
+    return success();
+  }
+
+private:
+  enum class PrintConversion {
+    // clang-format off
+    None,      // Pass the value to the printer unchanged.
+    ZeroExt64, // Zero-extend the value to i64 before printing.
+    SignExt64, // Sign-extend the value to i64 before printing.
+    Bitcast16  // Bitcast the value to i16 before printing (f16/bf16 bits).
+    // clang-format on
+  };
+
+  LogicalResult emitScalarPrint(ConversionPatternRewriter &rewriter,
+                                ModuleOp parent, Location loc, Type printType,
+                                Value value) const {
     if (typeConverter->convertType(printType) == nullptr)
       return failure();
 
     // Make sure element type has runtime support.
     PrintConversion conversion = PrintConversion::None;
-    VectorType vectorType = dyn_cast<VectorType>(printType);
-    Type eltType = vectorType ? vectorType.getElementType() : printType;
-    auto parent = printOp->getParentOfType<ModuleOp>();
     Operation *printer;
-    if (eltType.isF32()) {
+    if (printType.isF32()) {
       printer = LLVM::lookupOrCreatePrintF32Fn(parent);
-    } else if (eltType.isF64()) {
+    } else if (printType.isF64()) {
       printer = LLVM::lookupOrCreatePrintF64Fn(parent);
-    } else if (eltType.isF16()) {
+    } else if (printType.isF16()) {
       conversion = PrintConversion::Bitcast16; // bits!
       printer = LLVM::lookupOrCreatePrintF16Fn(parent);
-    } else if (eltType.isBF16()) {
+    } else if (printType.isBF16()) {
       conversion = PrintConversion::Bitcast16; // bits!
       printer = LLVM::lookupOrCreatePrintBF16Fn(parent);
-    } else if (eltType.isIndex()) {
+    } else if (printType.isIndex()) {
       printer = LLVM::lookupOrCreatePrintU64Fn(parent);
-    } else if (auto intTy = dyn_cast<IntegerType>(eltType)) {
+    } else if (auto intTy = dyn_cast<IntegerType>(printType)) {
       // Integers need a zero or sign extension on the operand
       // (depending on the source type) as well as a signed or
       // unsigned print method. Up to 64-bit is supported.
@@ -1485,86 +1525,24 @@
       return failure();
     }
 
-    // Unroll vector into elementary print calls.
-    int64_t rank = vectorType ? vectorType.getRank() : 0;
-    Type type = vectorType ? vectorType : eltType;
-    emitRanks(rewriter, printOp, adaptor.getSource(), type, printer, rank,
-              conversion);
-    emitCall(rewriter, printOp->getLoc(),
-             LLVM::lookupOrCreatePrintNewlineFn(parent));
-    rewriter.eraseOp(printOp);
-    return success();
-  }
-
-private:
-  enum class PrintConversion {
-    // clang-format off
-    None,
-    ZeroExt64,
-    SignExt64,
-    Bitcast16
-    // clang-format on
-  };
-
-  void emitRanks(ConversionPatternRewriter &rewriter, Operation *op,
-                 Value value, Type type, Operation *printer, int64_t rank,
-                 PrintConversion conversion) const {
-    VectorType vectorType = dyn_cast<VectorType>(type);
-    Location loc = op->getLoc();
-    if (!vectorType) {
-      assert(rank == 0 && "The scalar case expects rank == 0");
-      switch (conversion) {
-      case PrintConversion::ZeroExt64:
-        value = rewriter.create<arith::ExtUIOp>(
-            loc, IntegerType::get(rewriter.getContext(), 64), value);
-        break;
-      case PrintConversion::SignExt64:
-        value = rewriter.create<arith::ExtSIOp>(
-            loc, IntegerType::get(rewriter.getContext(), 64), value);
-        break;
-      case PrintConversion::Bitcast16:
-        value = rewriter.create<LLVM::BitcastOp>(
-            loc, IntegerType::get(rewriter.getContext(), 16), value);
-        break;
-      case PrintConversion::None:
-        break;
-      }
-      emitCall(rewriter, loc, printer, value);
-      return;
-    }
-
-    auto parent = op->getParentOfType<ModuleOp>();
-    emitCall(rewriter, loc, LLVM::lookupOrCreatePrintOpenFn(parent));
-    Operation *printComma = LLVM::lookupOrCreatePrintCommaFn(parent);
-
-    if (rank <= 1) {
-      auto reducedType = vectorType.getElementType();
-      auto llvmType = typeConverter->convertType(reducedType);
-      int64_t dim = rank == 0 ? 1 : vectorType.getDimSize(0);
-      for (int64_t d = 0; d < dim; ++d) {
-        Value nestedVal = extractOne(rewriter, *getTypeConverter(), loc, value,
-                                     llvmType, /*rank=*/0, /*pos=*/d);
-        emitRanks(rewriter, op, nestedVal, reducedType, printer, /*rank=*/0,
-                  conversion);
-        if (d != dim - 1)
-          emitCall(rewriter, loc, printComma);
-      }
-      emitCall(rewriter, loc, LLVM::lookupOrCreatePrintCloseFn(parent));
-      return;
-    }
-
-    int64_t dim = vectorType.getDimSize(0);
-    for (int64_t d = 0; d < dim; ++d) {
-      auto reducedType = reducedVectorTypeFront(vectorType);
-      auto llvmType = typeConverter->convertType(reducedType);
-      Value nestedVal = extractOne(rewriter, *getTypeConverter(), loc, value,
-                                   llvmType, rank, d);
-      emitRanks(rewriter, op, nestedVal, reducedType, printer, rank - 1,
-                conversion);
-      if (d != dim - 1)
-        emitCall(rewriter, loc, printComma);
+    switch (conversion) {
+    case PrintConversion::ZeroExt64:
+      value = rewriter.create<arith::ExtUIOp>(
+          loc, IntegerType::get(rewriter.getContext(), 64), value);
+      break;
+    case PrintConversion::SignExt64:
+      value = rewriter.create<arith::ExtSIOp>(
+          loc, IntegerType::get(rewriter.getContext(), 64), value);
+      break;
+    case PrintConversion::Bitcast16:
+      value = rewriter.create<LLVM::BitcastOp>(
+          loc, IntegerType::get(rewriter.getContext(), 16), value);
+      break;
+    case PrintConversion::None:
+      break;
     }
-    emitCall(rewriter, loc, LLVM::lookupOrCreatePrintCloseFn(parent));
+    emitCall(rewriter, loc, printer, value);
+    return success();
   }
 
   // Helper to emit a call.
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <numeric>
 #include <optional>
 #include <type_traits>
 
@@ -651,6 +652,171 @@
   }
 };
 
+/// Decompose an n-D PrintOp into a loop of elementary/scalar prints. This
+/// allows printing both 1D scalable vectors and n-D fixed size vectors.
+///
+/// E.g.:
+/// ```
+/// vector.print %v : vector<[4]xi32>
+/// ```
+/// is rewritten to:
+/// ```
+/// %c0 = arith.constant 0 : index
+/// %c4 = arith.constant 4 : index
+/// %c1 = arith.constant 1 : index
+/// %vscale = vector.vscale
+/// %length = arith.muli %vscale, %c4 : index
+/// %lastIndex = arith.subi %length, %c1 : index
+/// vector.print punctuation <open>
+/// scf.for %i = %c0 to %length step %c1 {
+///   %el = vector.extractelement %v[%i : index] : vector<[4]xi32>
+///   vector.print %el : i32 punctuation <no_punctuation>
+///   %notLastIndex = arith.cmpi ult, %i, %lastIndex : index
+///   scf.if %notLastIndex {
+///     vector.print punctuation <comma>
+///   }
+/// }
+/// vector.print punctuation <close>
+/// vector.print
+/// ```
+struct DecomposePrintOpConversion : public VectorToSCFPattern<vector::PrintOp> {
+  using VectorToSCFPattern<vector::PrintOp>::VectorToSCFPattern;
+  LogicalResult matchAndRewrite(vector::PrintOp printOp,
+                                PatternRewriter &rewriter) const override {
+    if (!printOp.getSource())
+      return failure();
+
+    VectorType vectorType = dyn_cast<VectorType>(printOp.getPrintType());
+    if (!vectorType)
+      return failure();
+
+    // Currently >= 2D scalable vectors are not supported.
+    // These can't be lowered to LLVM (as LLVM does not support scalable vectors
+    // of scalable vectors), and due to limitations of current ops can't be
+    // indexed with SSA values or flattened. This may change after
+    // https://reviews.llvm.org/D155034, though there still needs to be a path
+    // for lowering to LLVM.
+    if (vectorType.getRank() > 1 && vectorType.isScalable())
+      return failure();
+
+    auto loc = printOp.getLoc();
+    auto value = printOp.getSource();
+
+    if (auto intTy = dyn_cast<IntegerType>(vectorType.getElementType())) {
+      // Oddly sized integers are (somewhat) buggy on a lot of backends, so to
+      // avoid issues extend them to a more standard size.
+      // https://github.com/llvm/llvm-project/issues/30613
+      auto width = intTy.getWidth();
+      auto legalWidth = llvm::NextPowerOf2(std::max(8u, width) - 1);
+      auto legalIntTy = IntegerType::get(rewriter.getContext(), legalWidth,
+                                         intTy.getSignedness());
+      // arith can only take signless integers, so we must cast back and forth.
+      auto signlessSourceVectorType =
+          vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(intTy));
+      auto signlessTargetVectorType =
+          vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(legalIntTy));
+      auto targetVectorType = vectorType.cloneWith({}, legalIntTy);
+      value = rewriter.create<vector::BitCastOp>(loc, signlessSourceVectorType,
+                                                 value);
+      if (width == 1 || intTy.isUnsigned())
+        value = rewriter.create<arith::ExtUIOp>(loc, signlessTargetVectorType,
+                                                value);
+      else
+        value = rewriter.create<arith::ExtSIOp>(loc, signlessTargetVectorType,
+                                                value);
+      value = rewriter.create<vector::BitCastOp>(loc, targetVectorType, value);
+      vectorType = targetVectorType;
+    }
+
+    auto scalableDimensions = vectorType.getScalableDims();
+    auto shape = vectorType.getShape();
+    constexpr int64_t singletonShape[] = {1};
+    if (vectorType.getRank() == 0)
+      shape = singletonShape;
+
+    if (vectorType.getRank() != 1) {
+      // Flatten n-D vectors to 1D. This is done to allow indexing with a
+      // non-constant value (which can currently only be done via
+      // vector.extractelement for 1D vectors).
+      int64_t flatLength = std::accumulate(
+          shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
+      auto flatVectorType =
+          VectorType::get({flatLength}, vectorType.getElementType());
+      value = rewriter.create<vector::ShapeCastOp>(loc, flatVectorType, value);
+    }
+
+    vector::PrintOp firstClose;
+    SmallVector<Value, 8> loopIndices;
+    for (unsigned d = 0; d < shape.size(); d++) {
+      // Setup loop bounds and step.
+      Value lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+      Value upperBound = rewriter.create<arith::ConstantIndexOp>(loc, shape[d]);
+      Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+      if (!scalableDimensions.empty() && scalableDimensions[d]) {
+        auto vscale = rewriter.create<vector::VectorScaleOp>(
+            loc, rewriter.getIndexType());
+        upperBound = rewriter.create<arith::MulIOp>(loc, upperBound, vscale);
+      }
+      auto lastIndex = rewriter.create<arith::SubIOp>(loc, upperBound, step);
+
+      // Create a loop to print the elements surrounded by parentheses.
+      rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
+      auto loop =
+          rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
+      auto printClose = rewriter.create<vector::PrintOp>(
+          loc, vector::PrintPunctuation::Close);
+      if (!firstClose)
+        firstClose = printClose;
+
+      auto loopIdx = loop.getInductionVar();
+      loopIndices.push_back(loopIdx);
+
+      // Print a comma after all but the last element.
+      rewriter.setInsertionPointToStart(loop.getBody());
+      auto notLastIndex = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::ult, loopIdx, lastIndex);
+      rewriter.create<scf::IfOp>(loc, notLastIndex,
+                                 [&](OpBuilder &builder, Location loc) {
+                                   builder.create<vector::PrintOp>(
+                                       loc, vector::PrintPunctuation::Comma);
+                                   builder.create<scf::YieldOp>(loc);
+                                 });
+      // Nest the next loop (or the element print) ahead of the comma check.
+      rewriter.setInsertionPointToStart(loop.getBody());
+    }
+
+    // Compute the flattened index.
+    // Note: For rank > 1 vectors this assumes non-scalable dimensions.
+    Value flatIndex;
+    int64_t currentStride = 1;
+    for (int d = shape.size() - 1; d >= 0; d--) {
+      auto stride = rewriter.create<arith::ConstantIndexOp>(loc, currentStride);
+      auto index = rewriter.create<arith::MulIOp>(loc, stride, loopIndices[d]);
+      if (flatIndex)
+        flatIndex = rewriter.create<arith::AddIOp>(loc, flatIndex, index);
+      else
+        flatIndex = index;
+      currentStride *= shape[d];
+    }
+
+    // Print the scalar elements in the inner most loop.
+    auto element =
+        rewriter.create<vector::ExtractElementOp>(loc, value, flatIndex);
+    rewriter.create<vector::PrintOp>(loc, element,
+                                     vector::PrintPunctuation::NoPunctuation);
+
+    rewriter.setInsertionPointAfter(firstClose);
+    rewriter.create<vector::PrintOp>(loc, printOp.getPunctuation());
+    rewriter.eraseOp(printOp);
+    return success();
+  }
+
+  static IntegerType getIntTypeWithSignlessSemantics(IntegerType intTy) {
+    return IntegerType::get(intTy.getContext(), intTy.getWidth(),
+                            IntegerType::Signless);
+  }
+};
+
 /// Progressive lowering of vector transfer ops: Unpack one dimension.
 ///
 /// 1. Unpack one dimension from the current buffer type and cast the buffer
@@ -1280,6 +1446,8 @@
                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
         patterns.getContext(), options);
   }
+  patterns.add<lowering_n_d::DecomposePrintOpConversion>(patterns.getContext(),
+                                                         options);
 }
 
 namespace {
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -1044,57 +1044,6 @@
 
 // -----
 
-func.func @vector_print_vector_0d(%arg0: vector<f32>) {
-  vector.print %arg0 : vector<f32>
-  return
-}
-// CHECK-LABEL: @vector_print_vector_0d(
-// CHECK-SAME: %[[A:.*]]: vector<f32>)
-//       CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<f32> to vector<1xf32>
-//       CHECK: llvm.call @printOpen() : () -> ()
-//       CHECK: %[[T1:.*]] = llvm.mlir.constant(0 : index) : i64
-//       CHECK: %[[T2:.*]] = llvm.extractelement %[[T0]][%[[T1]] : i64] : vector<1xf32>
-//       CHECK: llvm.call @printF32(%[[T2]]) : (f32) -> ()
-//       CHECK: llvm.call @printClose() : () -> ()
-//       CHECK: llvm.call @printNewline() : () -> ()
-//       CHECK: return
-
-// -----
-
-func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
-  vector.print %arg0 : vector<2x2xf32>
-  return
-}
-// CHECK-LABEL: @vector_print_vector(
-// CHECK-SAME: %[[A:.*]]: vector<2x2xf32>)
-//       CHECK:    %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<2x2xf32> to !llvm.array<2 x vector<2xf32>>
-//       CHECK:    llvm.call @printOpen() : () -> ()
-//       CHECK:    %[[x0:.*]] = llvm.extractvalue %[[VAL_1]][0] : !llvm.array<2 x vector<2xf32>>
-//       CHECK:    llvm.call @printOpen() : () -> ()
-//       CHECK:    %[[x1:.*]] = llvm.mlir.constant(0 : index) : i64
-//       CHECK:    %[[x2:.*]] = llvm.extractelement %[[x0]][%[[x1]] : i64] : vector<2xf32>
-//       CHECK:    llvm.call @printF32(%[[x2]]) : (f32) -> ()
-//       CHECK:    llvm.call @printComma() : () -> ()
-//       CHECK:    %[[x3:.*]] = llvm.mlir.constant(1 : index) : i64
-//       CHECK:    %[[x4:.*]] = llvm.extractelement %[[x0]][%[[x3]] : i64] : vector<2xf32>
-//       CHECK:    llvm.call @printF32(%[[x4]]) : (f32) -> ()
-//       CHECK:    llvm.call @printClose() : () -> ()
-//       CHECK:    llvm.call @printComma() : () -> ()
-//       CHECK:    %[[x5:.*]] = llvm.extractvalue %[[VAL_1]][1] : !llvm.array<2 x vector<2xf32>>
-//       CHECK:    llvm.call @printOpen() : () -> ()
-//       CHECK:    %[[x6:.*]] = llvm.mlir.constant(0 : index) : i64
-//       CHECK:    %[[x7:.*]] = llvm.extractelement %[[x5]][%[[x6]] : i64] : vector<2xf32>
-//       CHECK:    llvm.call @printF32(%[[x7]]) : (f32) -> ()
-//       CHECK:    llvm.call @printComma() : () -> ()
-//       CHECK:    %[[x8:.*]] = llvm.mlir.constant(1 : index) : i64
-//       CHECK:    %[[x9:.*]] = llvm.extractelement %[[x5]][%[[x8]] : i64] : vector<2xf32>
-//       CHECK:    llvm.call @printF32(%[[x9]]) : (f32) -> ()
-//       CHECK:    llvm.call @printClose() : () -> ()
-//       CHECK:    llvm.call @printClose() : () -> ()
-//       CHECK:    llvm.call @printNewline() : () -> ()
-
-// -----
-
 func.func @extract_strided_slice1(%arg0: vector<4xf32>) -> vector<2xf32> {
   %0 = vector.extract_strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
   return %0 : vector<2xf32>
diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
--- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
+++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
@@ -546,3 +546,92 @@
 // CHECK:             } else {
 // CHECK:             }
 // CHECK:           }
+
+// -----
+
+func.func @vector_print_vector_0d(%arg0: vector<f32>) {
+  vector.print %arg0 : vector<f32>
+  return
+}
+// CHECK-LABEL:   func.func @vector_print_vector_0d(
+// CHECK-SAME:                                      %[[VEC:.*]]: vector<f32>) {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
+// CHECK:           vector.print punctuation <open>
+// CHECK:           scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
+// CHECK:             %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[IDX]] : index] : vector<1xf32>
+// CHECK:             vector.print %[[EL]] : f32 punctuation <no_punctuation>
+// CHECK:             %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[C0]] : index
+// CHECK:             scf.if %[[IS_NOT_LAST]] {
+// CHECK:               vector.print punctuation <comma>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           vector.print punctuation <close>
+// CHECK:           vector.print
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
+  vector.print %arg0 : vector<2x2xf32>
+  return
+}
+// CHECK-LABEL:   func.func @vector_print_vector(
+// CHECK-SAME:                                   %[[VEC:.*]]: vector<2x2xf32>) {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
+// CHECK:           vector.print punctuation <open>
+// CHECK:           scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
+// CHECK:             vector.print punctuation <open>
+// CHECK:             scf.for %[[J:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
+// CHECK:               %[[OUTER_INDEX:.*]] = arith.muli %[[I]], %[[C2]] : index
+// CHECK:               %[[FLAT_INDEX:.*]] = arith.addi %[[J]], %[[OUTER_INDEX]] : index
+// CHECK:               %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[FLAT_INDEX]] : index] : vector<4xf32>
+// CHECK:               vector.print %[[EL]] : f32 punctuation <no_punctuation>
+// CHECK:               %[[IS_NOT_LAST_J:.*]] = arith.cmpi ult, %[[J]], %[[C1]] : index
+// CHECK:               scf.if %[[IS_NOT_LAST_J]] {
+// CHECK:                 vector.print punctuation <comma>
+// CHECK:               }
+// CHECK:             }
+// CHECK:             vector.print punctuation <close>
+// CHECK:             %[[IS_NOT_LAST_I:.*]] = arith.cmpi ult, %[[I]], %[[C1]] : index
+// CHECK:             scf.if %[[IS_NOT_LAST_I]] {
+// CHECK:               vector.print punctuation <comma>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           vector.print punctuation <close>
+// CHECK:           vector.print
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @vector_print_scalable_vector(%arg0: vector<[4]xi32>) {
+  vector.print %arg0 : vector<[4]xi32>
+  return
+}
+// CHECK-LABEL:   func.func @vector_print_scalable_vector(
+// CHECK-SAME:                                            %[[VEC:.*]]: vector<[4]xi32>) {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[C4:.*]] = arith.constant 4 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VSCALE:.*]] = vector.vscale
+// CHECK:           %[[UPPER_BOUND:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+// CHECK:           %[[LAST_INDEX:.*]] = arith.subi %[[UPPER_BOUND]], %[[C1]] : index
+// CHECK:           vector.print punctuation <open>
+// CHECK:           scf.for %[[IDX:.*]] = %[[C0]] to %[[UPPER_BOUND]] step %[[C1]] {
+// CHECK:             %[[EL:.*]] = vector.extractelement %[[VEC]]{{\[}}%[[IDX]] : index] : vector<[4]xi32>
+// CHECK:             vector.print %[[EL]] : i32 punctuation <no_punctuation>
+// CHECK:             %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[LAST_INDEX]] : index
+// CHECK:             scf.if %[[IS_NOT_LAST]] {
+// CHECK:               vector.print punctuation <comma>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           vector.print punctuation <close>
+// CHECK:           vector.print
+// CHECK:           return
+// CHECK:         }
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
@@ -2,8 +2,9 @@
 // calculations. Emulate i16 ops with i8 ops.
 
 // RUN: mlir-opt %s --test-arith-emulate-wide-int="widest-int-supported=8" \
-// RUN:             --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:             --convert-func-to-llvm --convert-arith-to-llvm | \
+// RUN:             --convert-vector-to-scf --convert-scf-to-cf --convert-cf-to-llvm \
+// RUN:             --convert-vector-to-llvm --convert-func-to-llvm --convert-arith-to-llvm \
+// RUN:             --reconcile-unrealized-casts | \
 // RUN:   mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:      --shared-libs="%mlir_c_runner_utils,%mlir_runner_utils" | \
 // RUN:   FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-constants-i16.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-constants-i16.mlir
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-constants-i16.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-constants-i16.mlir
@@ -2,8 +2,8 @@
 // constants and that printing works. Emulate i16 ops with i8 ops.
 
 // RUN: mlir-opt %s --test-arith-emulate-wide-int="widest-int-supported=8" \
-// RUN:             --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:             --convert-func-to-llvm --convert-arith-to-llvm | \
+// RUN:             --convert-vector-to-scf --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN:             --convert-func-to-llvm --convert-arith-to-llvm --reconcile-unrealized-casts | \
 // RUN:   mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:                   --shared-libs=%mlir_c_runner_utils | \
 // RUN:   FileCheck %s --match-full-lines --check-prefix=EMULATED
diff --git a/mlir/test/Integration/Dialect/LLVMIR/CPU/X86/test-inline-asm-vector.mlir b/mlir/test/Integration/Dialect/LLVMIR/CPU/X86/test-inline-asm-vector.mlir
--- a/mlir/test/Integration/Dialect/LLVMIR/CPU/X86/test-inline-asm-vector.mlir
+++ b/mlir/test/Integration/Dialect/LLVMIR/CPU/X86/test-inline-asm-vector.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-vector-to-llvm |  \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts |  \
 // RUN: mlir-cpu-runner -e entry_point_with_all_constants -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils
 
 module {
-  llvm.func @function_to_run(%a: vector<8xf32>, %b: vector<8xf32>)  {
+  func.func @function_to_run(%a: vector<8xf32>, %b: vector<8xf32>)  {
     // CHECK: ( 8, 10, 12, 14, 16, 18, 20, 22 )
     %r0 = llvm.inline_asm asm_dialect = intel
         "vaddps $0, $1, $2", "=x,x,x" %a, %b:
@@ -36,21 +36,21 @@
       : vector<8xf32>, vector<8xf32>
     vector.print %r4: vector<8xf32>
 
-    llvm.return
+    return
   }
 
   // Solely exists to prevent inlining and get the expected assembly.
-  llvm.func @entry_point(%a: vector<8xf32>, %b: vector<8xf32>)  {
-    llvm.call @function_to_run(%a, %b) : (vector<8xf32>, vector<8xf32>) -> ()
-    llvm.return
+  func.func @entry_point(%a: vector<8xf32>, %b: vector<8xf32>)  {
+    func.call @function_to_run(%a, %b) : (vector<8xf32>, vector<8xf32>) -> ()
+    return
   }
 
-  llvm.func @entry_point_with_all_constants()  {
+  func.func @entry_point_with_all_constants()  {
     %a = llvm.mlir.constant(dense<[0.0, 1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0]>
       : vector<8xf32>) : vector<8xf32>
     %b = llvm.mlir.constant(dense<[8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]>
       : vector<8xf32>) : vector<8xf32>
-    llvm.call @function_to_run(%a, %b) : (vector<8xf32>, vector<8xf32>) -> ()
-    llvm.return
+    func.call @function_to_run(%a, %b) : (vector<8xf32>, vector<8xf32>) -> ()
+    return
   }
 }
diff --git a/mlir/test/Integration/Dialect/LLVMIR/CPU/test-vp-intrinsic.mlir b/mlir/test/Integration/Dialect/LLVMIR/CPU/test-vp-intrinsic.mlir
--- a/mlir/test/Integration/Dialect/LLVMIR/CPU/test-vp-intrinsic.mlir
+++ b/mlir/test/Integration/Dialect/LLVMIR/CPU/test-vp-intrinsic.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt %s -convert-vector-to-llvm -finalize-memref-to-llvm \
-// RUN:             -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm \
+// RUN: -convert-vector-to-llvm -convert-index-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
 // RUN: mlir-translate -mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry \
 // RUN:      --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,9 +1,9 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// DEFINE: %{compile} = mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// DEFINE: %{compile} = mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
 // DEFINE: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
-// DEFINE: %s 
+// DEFINE: %s
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -4,7 +4,7 @@
 // RUN: mlir-opt \
 // RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
 // RUN: %s \
-// RUN: | mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
 // RUN:            --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -lower-affine -convert-scf-to-cf -convert-vector-to-llvm="enable-arm-sve" -finalize-memref-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -canonicalize | \
+// RUN: mlir-opt %s -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-arm-sve" -finalize-memref-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -canonicalize | \
 // RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-dot.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-dot.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-dot.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-dot.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-translate --mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-mask-compress.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-mask-compress.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-mask-compress.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-mask-compress.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts  | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts  | \
 // RUN: mlir-translate  --mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="avx512bw" --dlopen=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-rsqrt.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-rsqrt.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-rsqrt.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-rsqrt.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-translate --mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-vp2intersect-i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-vp2intersect-i32.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-vp2intersect-i32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-vp2intersect-i32.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts  | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts  | \
 // RUN: mlir-translate  --mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="avx512bw,avx512vp2intersect" --dlopen=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-0-d-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-0-d-vectors.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-0-d-vectors.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-0-d-vectors.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-broadcast.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-broadcast.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-broadcast.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-broadcast.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-compress.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-compress.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-compress.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-compress.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-constant-mask.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-constant-mask.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-constant-mask.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-constant-mask.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask-v4i1.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask-v4i1.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask-v4i1.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask-v4i1.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-create-mask.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-expand.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-expand.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-expand.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-expand.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-extract-strided-slice.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-extract-strided-slice.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-extract-strided-slice.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-extract-strided-slice.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-col.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-col.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-col.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-col.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -O0 -enable-matrix -matrix-allow-contract -matrix-default-layout=column-major \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-row.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-row.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-row.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-flat-transpose-row.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -O0 -enable-matrix -matrix-allow-contract -matrix-default-layout=row-major \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-fma.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-fma.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-fma.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-fma.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-gather.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-gather.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-gather.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-gather.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-index-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-index-vectors.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-index-vectors.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-index-vectors.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-insert-strided-slice.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-insert-strided-slice.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-insert-strided-slice.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-insert-strided-slice.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-maskedload.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-maskedload.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-maskedload.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-maskedload.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-maskedstore.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-maskedstore.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-maskedstore.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-maskedstore.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-col.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-col.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-col.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-col.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -O0 -enable-matrix -matrix-allow-contract -matrix-default-layout=column-major \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-row.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-row.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-row.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-matrix-multiply-row.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -O0 -enable-matrix -matrix-allow-contract -matrix-default-layout=row-major \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-f32.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-f32.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-i64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-i64.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-i64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-outerproduct-i64.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-print-fp.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-print-fp.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-print-fp.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-print-fp.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-print-int.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-print-int.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-print-int.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-print-int.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-realloc.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-realloc.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-realloc.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-realloc.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts |\
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm='use-aligned-alloc=1' -convert-func-to-llvm -arith-expand -reconcile-unrealized-casts |\
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm='use-aligned-alloc=1' -convert-func-to-llvm -arith-expand -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | FileCheck %s
 
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32-reassoc.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32-reassoc.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32-reassoc.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32-reassoc.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf \
 // RUN:             -convert-vector-to-llvm='reassociate-fp-reductions' \
 // RUN:             -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f32.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64-reassoc.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64-reassoc.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64-reassoc.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64-reassoc.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf \
 // RUN:             -convert-vector-to-llvm='reassociate-fp-reductions' \
 // RUN:             -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-f64.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i32.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i32.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i4.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i4.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i4.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i64.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-i64.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-si4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-si4.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-si4.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-si4.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-ui4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-ui4.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-ui4.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-reductions-ui4.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-scan.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-scan.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-scan.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-scan.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-vector-scan-lowering -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-vector-scan-lowering -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-scatter.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-scatter.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-scatter.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-scatter.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-shape-cast.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-shape-cast.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-shape-cast.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-shape-cast.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf \
 // RUN:   -test-transform-dialect-interpreter \
 // RUN:   -test-transform-dialect-erase-schedule \
 // RUN:   -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-saxpy-jagged-matvec.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-saxpy-jagged-matvec.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-saxpy-jagged-matvec.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-sparse-saxpy-jagged-matvec.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transpose.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transpose.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_c_runner_utils | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
-// RUN: mlir-opt  -lower-affine -convert-scf-to-cf -convert-vector-to-llvm \
+// RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
 // RUN:  -convert-arith-to-llvm -gpu-kernel-outlining |\
 // RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
 // RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -1,7 +1,7 @@
 // Run the test cases without distributing ops to test default lowering. Run
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
-// RUN: mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
 // RUN:  -gpu-kernel-outlining |\
 // RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
 // RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
@@ -14,7 +14,7 @@
 // Run the same test cases with distribution and propagation.
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
-// RUN: mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
 // RUN:  -gpu-kernel-outlining |\
 // RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
 // RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
@@ -26,7 +26,7 @@
 
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
-// RUN: mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
 // RUN:  -gpu-kernel-outlining |\
 // RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
 // RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,8 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/mlir-cpu-runner/X86Vector/math-polynomial-approx-avx2.mlir b/mlir/test/mlir-cpu-runner/X86Vector/math-polynomial-approx-avx2.mlir
--- a/mlir/test/mlir-cpu-runner/X86Vector/math-polynomial-approx-avx2.mlir
+++ b/mlir/test/mlir-cpu-runner/X86Vector/math-polynomial-approx-avx2.mlir
@@ -1,8 +1,10 @@
 // RUN:   mlir-opt %s -test-math-polynomial-approximation="enable-avx2"        \
+// RUN:               -convert-vector-to-scf                                   \
+// RUN:               -convert-scf-to-cf                                       \
 // RUN:               -convert-arith-to-llvm                                   \
 // RUN:               -convert-vector-to-llvm="enable-x86vector"               \
 // RUN:               -convert-math-to-llvm                                    \
-// RUN:               -convert-func-to-llvm                                     \
+// RUN:               -convert-func-to-llvm                                    \
 // RUN:               -reconcile-unrealized-casts                              \
 // RUN: | mlir-cpu-runner                                                      \
 // RUN:     -e main -entry-point-result=void -O0                               \
diff --git a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
--- a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
@@ -1,4 +1,4 @@
-// RUN:   mlir-opt %s -pass-pipeline="builtin.module(func.func(test-math-polynomial-approximation,convert-arith-to-llvm),convert-vector-to-llvm,func.func(convert-math-to-llvm),convert-func-to-llvm,reconcile-unrealized-casts)" \
+// RUN:   mlir-opt %s -pass-pipeline="builtin.module(func.func(test-math-polynomial-approximation,convert-arith-to-llvm),convert-vector-to-scf,convert-scf-to-cf,convert-cf-to-llvm,convert-vector-to-llvm,func.func(convert-math-to-llvm),convert-func-to-llvm,reconcile-unrealized-casts)" \
 // RUN: | mlir-cpu-runner                                                      \
 // RUN:     -e main -entry-point-result=void -O0                               \
 // RUN:     -shared-libs=%mlir_c_runner_utils  \
diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
--- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
@@ -1,4 +1,4 @@
-// RUN:   mlir-opt %s -pass-pipeline="builtin.module(func.func(test-expand-math,convert-arith-to-llvm),convert-vector-to-llvm,func.func(convert-math-to-llvm),convert-func-to-llvm,reconcile-unrealized-casts)" \
+// RUN:   mlir-opt %s -pass-pipeline="builtin.module(func.func(test-expand-math,convert-arith-to-llvm),convert-vector-to-scf,convert-scf-to-cf,convert-cf-to-llvm,convert-vector-to-llvm,func.func(convert-math-to-llvm),convert-func-to-llvm,reconcile-unrealized-casts)" \
 // RUN: | mlir-cpu-runner                                                      \
 // RUN:     -e main -entry-point-result=void -O0                               \
 // RUN:     -shared-libs=%mlir_c_runner_utils  \
diff --git a/mlir/test/python/dialects/vector.py b/mlir/test/python/dialects/vector.py
--- a/mlir/test/python/dialects/vector.py
+++ b/mlir/test/python/dialects/vector.py
@@ -21,7 +21,7 @@
 
         @func.FuncOp.from_py_func(VectorType.get((12, 5), F32Type.get()))
         def print_vector(arg):
-            return vector.PrintOp(arg)
+            return vector.PrintOp(source=arg)
 
     # CHECK-LABEL: func @print_vector(
     # CHECK-SAME:                     %[[ARG:.*]]: vector<12x5xf32>) {