diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVWebGPUTransforms.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVWebGPUTransforms.cpp
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVWebGPUTransforms.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVWebGPUTransforms.cpp
@@ -17,8 +17,13 @@
 #include "mlir/IR/Location.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 
+#include <array>
+#include <cstdint>
+
 namespace mlir {
 namespace spirv {
 #define GEN_PASS_DEF_SPIRVWEBGPUPREPAREPASS
@@ -61,41 +66,62 @@
         loc, llvm::formatv("Unexpected integer type for WebGPU: '{0}'", elemTy));
 
-  // Calculate the 'low' and the 'high' result separately, using long
-  // multiplication:
-  //
-  // lhs = [0 0] [a b]
-  // rhs = [0 0] [c d]
-  //        --lhs * rhs--
-  // = [   a * c   ] [   b * d   ] +
-  //   [     0     ] [a * d + b * c] [ 0 ]
+  // Emulate 64-bit multiplication by splitting each input element of type i32
+  // into 2 16-bit digits of type i32. This is so that the intermediate
+  // multiplications and additions do not overflow. We extract these 16-bit
+  // digits from i32 vector elements by masking (low digit) and shifting right
+  // (high digit).
   //
-  // ==> high = (a * c) + (a * d + b * c) >> 16
-  Value low = rewriter.create<IMulOp>(loc, lhs, rhs);
-
+  // The multiplication algorithm used is the standard (long) multiplication.
+  // Multiplying two i32 integers produces 64 bits of result, i.e., 4 16-bit
+  // digits. After constant-folding, we end up emitting only 4 multiplications
+  // and 4 additions.
   Value cstLowMask = rewriter.create<ConstantOp>(
       loc, lhs.getType(), getScalarOrSplatAttr(argTy, (1 << 16) - 1));
-  auto getLowHalf = [&rewriter, loc, cstLowMask](Value val) {
+  auto getLowDigit = [&rewriter, loc, cstLowMask](Value val) {
     return rewriter.create<BitwiseAndOp>(loc, val, cstLowMask);
   };
 
   Value cst16 = rewriter.create<ConstantOp>(loc, lhs.getType(),
                                             getScalarOrSplatAttr(argTy, 16));
-  auto getHighHalf = [&rewriter, loc, cst16](Value val) {
+  auto getHighDigit = [&rewriter, loc, cst16](Value val) {
     return rewriter.create<ShiftRightLogicalOp>(loc, val, cst16);
   };
 
-  Value lhsLow = getLowHalf(lhs);
-  Value lhsHigh = getHighHalf(lhs);
-  Value rhsLow = getLowHalf(rhs);
-  Value rhsHigh = getHighHalf(rhs);
-
-  Value high0 = rewriter.create<IMulOp>(loc, lhsHigh, rhsHigh);
-  Value mid = rewriter.create<IAddOp>(
-      loc, rewriter.create<IMulOp>(loc, lhsHigh, rhsLow),
-      rewriter.create<IMulOp>(loc, lhsLow, rhsHigh));
-  Value high1 = getHighHalf(mid);
-  Value high = rewriter.create<IAddOp>(loc, high0, high1);
+  Value cst0 = rewriter.create<ConstantOp>(loc, lhs.getType(),
+                                           getScalarOrSplatAttr(argTy, 0));
+
+  Value lhsLow = getLowDigit(lhs);
+  Value lhsHigh = getHighDigit(lhs);
+  Value rhsLow = getLowDigit(rhs);
+  Value rhsHigh = getHighDigit(rhs);
+
+  std::array<Value, 2> lhsDigits = {lhsLow, lhsHigh};
+  std::array<Value, 2> rhsDigits = {rhsLow, rhsHigh};
+  std::array<Value, 4> resultDigits = {cst0, cst0, cst0, cst0};
+
+  for (auto [i, lhsDigit] : llvm::enumerate(lhsDigits)) {
+    for (auto [j, rhsDigit] : llvm::enumerate(rhsDigits)) {
+      Value &thisResDigit = resultDigits[i + j];
+      Value mul = rewriter.create<IMulOp>(loc, lhsDigit, rhsDigit);
+      Value current = rewriter.createOrFold<IAddOp>(loc, thisResDigit, mul);
+      thisResDigit = getLowDigit(current);
+
+      if (i + j + 1 != resultDigits.size()) {
+        Value &nextResDigit = resultDigits[i + j + 1];
+        Value carry = rewriter.createOrFold<IAddOp>(loc, nextResDigit,
+                                                    getHighDigit(current));
+        nextResDigit = carry;
+      }
+    }
+  }
+
+  auto combineDigits = [loc, cst16, &rewriter](Value low, Value high) {
+    Value highBits = rewriter.create<ShiftLeftLogicalOp>(loc, high, cst16);
+    return rewriter.create<BitwiseOrOp>(loc, low, highBits);
+  };
+  Value low = combineDigits(resultDigits[0], resultDigits[1]);
+  Value high = combineDigits(resultDigits[2], resultDigits[3]);
 
   rewriter.replaceOpWithNewOp<CompositeConstructOp>(
       op, op.getType(), llvm::makeArrayRef({low, high}));
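For reference, the digit-based expansion above can be mirrored on the host as follows. This is an illustrative sketch only, not part of the patch (the helper name mulExtendedU32 and the driver in main are made up); it uses the same 16-bit digits, the same loop structure, and the same carry handling as the pattern, but on plain uint32_t values, and cross-checks the result against a native 64-bit multiplication. Splitting into 16-bit digits keeps every intermediate product and sum within 32 bits, which is exactly what the comment in the pattern relies on.

// Host-side reference for the digit-based expansion (illustrative only).
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint32_t, 2> mulExtendedU32(uint32_t lhs, uint32_t rhs) {
  const uint32_t mask = (1u << 16) - 1;
  std::array<uint32_t, 2> lhsDigits = {lhs & mask, lhs >> 16};
  std::array<uint32_t, 2> rhsDigits = {rhs & mask, rhs >> 16};
  std::array<uint32_t, 4> resultDigits = {0, 0, 0, 0};

  for (size_t i = 0; i < lhsDigits.size(); ++i) {
    for (size_t j = 0; j < rhsDigits.size(); ++j) {
      // Accumulate one partial product; 'current' never exceeds 32 bits.
      uint32_t current = resultDigits[i + j] + lhsDigits[i] * rhsDigits[j];
      resultDigits[i + j] = current & mask;
      // Propagate the carry into the next digit, as in the pattern above.
      if (i + j + 1 != resultDigits.size())
        resultDigits[i + j + 1] += current >> 16;
    }
  }

  uint32_t low = resultDigits[0] | (resultDigits[1] << 16);
  uint32_t high = resultDigits[2] | (resultDigits[3] << 16);
  return {low, high};
}

int main() {
  // Cross-check against a plain 64-bit multiplication: 0xFFFFFFFF * 2.
  uint64_t wide = uint64_t(0xFFFFFFFFu) * uint64_t(2u);
  auto [low, high] = mulExtendedU32(0xFFFFFFFFu, 2u);
  assert(low == uint32_t(wide) && high == uint32_t(wide >> 32));
  return 0;
}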
diff --git a/mlir/test/Dialect/SPIRV/Transforms/webgpu-prepare.mlir b/mlir/test/Dialect/SPIRV/Transforms/webgpu-prepare.mlir
--- a/mlir/test/Dialect/SPIRV/Transforms/webgpu-prepare.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/webgpu-prepare.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt --split-input-file --verify-diagnostics --spirv-webgpu-prepare %s | FileCheck %s
+// RUN: mlir-opt --split-input-file --verify-diagnostics \
+// RUN:   --spirv-webgpu-prepare --cse %s | FileCheck %s
 
 //===----------------------------------------------------------------------===//
 // spirv.UMulExtended
@@ -10,18 +11,23 @@
 // CHECK-SAME: ([[ARG0:%.+]]: i32, [[ARG1:%.+]]: i32)
 // CHECK-DAG:   [[CSTMASK:%.+]] = spirv.Constant 65535 : i32
 // CHECK-DAG:   [[CST16:%.+]] = spirv.Constant 16 : i32
-// CHECK-NEXT:  [[RESLOW:%.+]] = spirv.IMul [[ARG0]], [[ARG1]] : i32
 // CHECK-NEXT:  [[LHSLOW:%.+]] = spirv.BitwiseAnd [[ARG0]], [[CSTMASK]] : i32
 // CHECK-NEXT:  [[LHSHI:%.+]] = spirv.ShiftRightLogical [[ARG0]], [[CST16]] : i32
 // CHECK-NEXT:  [[RHSLOW:%.+]] = spirv.BitwiseAnd [[ARG1]], [[CSTMASK]] : i32
 // CHECK-NEXT:  [[RHSHI:%.+]] = spirv.ShiftRightLogical [[ARG1]], [[CST16]] : i32
-// CHECK-DAG:   [[RESHI0:%.+]] = spirv.IMul [[LHSHI]], [[RHSHI]] : i32
-// CHECK-DAG:   [[MID0:%.+]] = spirv.IMul [[LHSHI]], [[RHSLOW]] : i32
-// CHECK-DAG:   [[MID1:%.+]] = spirv.IMul [[LHSLOW]], [[RHSHI]] : i32
-// CHECK-NEXT:  [[MID:%.+]] = spirv.IAdd [[MID0]], [[MID1]] : i32
-// CHECK-NEXT:  [[RESHI1:%.+]] = spirv.ShiftRightLogical [[MID]], [[CST16]] : i32
-// CHECK-NEXT:  [[RESHI:%.+]] = spirv.IAdd [[RESHI0]], [[RESHI1]] : i32
-// CHECK-NEXT:  [[RES:%.+]] = spirv.CompositeConstruct [[RESLOW]], [[RESHI]] : (i32, i32) -> !spirv.struct<(i32, i32)>
+// CHECK-DAG:   spirv.IMul [[LHSLOW]], [[RHSLOW]]
+// CHECK-DAG:   spirv.IMul [[LHSLOW]], [[RHSHI]]
+// CHECK-DAG:   spirv.IMul [[LHSHI]], [[RHSLOW]]
+// CHECK-DAG:   spirv.IMul [[LHSHI]], [[RHSHI]]
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK:       spirv.ShiftLeftLogical {{%.+}}, [[CST16]] : i32
+// CHECK:       spirv.BitwiseOr
+// CHECK:       spirv.ShiftLeftLogical {{%.+}}, [[CST16]] : i32
+// CHECK:       spirv.BitwiseOr
+// CHECK:       [[RES:%.+]] = spirv.CompositeConstruct [[RESLO:%.+]], [[RESHI:%.+]] : (i32, i32) -> !spirv.struct<(i32, i32)>
 // CHECK-NEXT:  spirv.ReturnValue [[RES]] : !spirv.struct<(i32, i32)>
 spirv.func @umul_extended_i32(%arg0 : i32, %arg1 : i32) -> !spirv.struct<(i32, i32)> "None" {
   %0 = spirv.UMulExtended %arg0, %arg1 : !spirv.struct<(i32, i32)>
@@ -32,18 +38,23 @@
 // CHECK-SAME: ([[ARG0:%.+]]: vector<3xi32>, [[ARG1:%.+]]: vector<3xi32>)
 // CHECK-DAG:   [[CSTMASK:%.+]] = spirv.Constant dense<65535> : vector<3xi32>
 // CHECK-DAG:   [[CST16:%.+]] = spirv.Constant dense<16> : vector<3xi32>
-// CHECK-NEXT:  [[RESLOW:%.+]] = spirv.IMul [[ARG0]], [[ARG1]] : vector<3xi32>
 // CHECK-NEXT:  [[LHSLOW:%.+]] = spirv.BitwiseAnd [[ARG0]], [[CSTMASK]] : vector<3xi32>
 // CHECK-NEXT:  [[LHSHI:%.+]] = spirv.ShiftRightLogical [[ARG0]], [[CST16]] : vector<3xi32>
 // CHECK-NEXT:  [[RHSLOW:%.+]] = spirv.BitwiseAnd [[ARG1]], [[CSTMASK]] : vector<3xi32>
 // CHECK-NEXT:  [[RHSHI:%.+]] = spirv.ShiftRightLogical [[ARG1]], [[CST16]] : vector<3xi32>
-// CHECK-DAG:   [[RESHI0:%.+]] = spirv.IMul [[LHSHI]], [[RHSHI]] : vector<3xi32>
-// CHECK-DAG:   [[MID0:%.+]] = spirv.IMul [[LHSHI]], [[RHSLOW]] : vector<3xi32>
-// CHECK-DAG:   [[MID1:%.+]] = spirv.IMul [[LHSLOW]], [[RHSHI]] : vector<3xi32>
-// CHECK-NEXT:  [[MID:%.+]] = spirv.IAdd [[MID0]], [[MID1]] : vector<3xi32>
-// CHECK-NEXT:  [[RESHI1:%.+]] = spirv.ShiftRightLogical [[MID]], [[CST16]] : vector<3xi32>
-// CHECK-NEXT:  [[RESHI:%.+]] = spirv.IAdd [[RESHI0]], [[RESHI1]] : vector<3xi32>
-// CHECK-NEXT:  [[RES:%.+]] = spirv.CompositeConstruct [[RESLOW]], [[RESHI]]
+// CHECK-DAG:   spirv.IMul [[LHSLOW]], [[RHSLOW]]
+// CHECK-DAG:   spirv.IMul [[LHSLOW]], [[RHSHI]]
+// CHECK-DAG:   spirv.IMul [[LHSHI]], [[RHSLOW]]
+// CHECK-DAG:   spirv.IMul [[LHSHI]], [[RHSHI]]
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK-DAG:   spirv.IAdd
+// CHECK:       spirv.ShiftLeftLogical {{%.+}}, [[CST16]]
+// CHECK:       spirv.BitwiseOr
+// CHECK:       spirv.ShiftLeftLogical {{%.+}}, [[CST16]]
+// CHECK:       spirv.BitwiseOr
+// CHECK-NEXT:  [[RES:%.+]] = spirv.CompositeConstruct [[RESLOW:%.+]], [[RESHI:%.+]]
 // CHECK-NEXT:  spirv.ReturnValue [[RES]] : !spirv.struct<(vector<3xi32>, vector<3xi32>)>
 spirv.func @umul_extended_vector_i32(%arg0 : vector<3xi32>, %arg1 : vector<3xi32>)
     -> !spirv.struct<(vector<3xi32>, vector<3xi32>)> "None" {
diff --git a/mlir/test/mlir-vulkan-runner/umul_extended.mlir b/mlir/test/mlir-vulkan-runner/umul_extended.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-vulkan-runner/umul_extended.mlir
@@ -0,0 +1,66 @@
+// Make sure that unsigned extended multiplication produces expected results
+// with and without expansion to primitive mul/add ops for WebGPU.
+
+// RUN: mlir-vulkan-runner %s \
+// RUN:   --shared-libs=%mlir_lib_dir/libvulkan-runtime-wrappers%shlibext,%mlir_lib_dir/libmlir_runner_utils%shlibext \
+// RUN:   --entry-point-result=void | FileCheck %s
+
+// RUN: mlir-vulkan-runner %s --vulkan-runner-spirv-webgpu-prepare \
+// RUN:   --shared-libs=%mlir_lib_dir/libvulkan-runtime-wrappers%shlibext,%mlir_lib_dir/libmlir_runner_utils%shlibext \
+// RUN:   --entry-point-result=void | FileCheck %s
+
+// CHECK: [0, 1, -2, 1, 1048560, -87620295, -131071, -49]
+// CHECK: [0, 0, 1, -2, 0, 65534, -131070, 6]
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
+} {
+  gpu.module @kernels {
+    gpu.func @kernel_add(%arg0 : memref<8xi32>, %arg1 : memref<8xi32>, %arg2 : memref<8xi32>, %arg3 : memref<8xi32>)
+      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
+      %0 = gpu.block_id x
+      %lhs = memref.load %arg0[%0] : memref<8xi32>
+      %rhs = memref.load %arg1[%0] : memref<8xi32>
+      %low, %hi = arith.mului_extended %lhs, %rhs : i32
+      memref.store %low, %arg2[%0] : memref<8xi32>
+      memref.store %hi, %arg3[%0] : memref<8xi32>
+      gpu.return
+    }
+  }
+
+  func.func @main() {
+    %buf0 = memref.alloc() : memref<8xi32>
+    %buf1 = memref.alloc() : memref<8xi32>
+    %buf2 = memref.alloc() : memref<8xi32>
+    %buf3 = memref.alloc() : memref<8xi32>
+    %i32_0 = arith.constant 0 : i32
+
+    // Initialize output buffers.
+    %buf4 = memref.cast %buf2 : memref<8xi32> to memref<?xi32>
+    %buf5 = memref.cast %buf3 : memref<8xi32> to memref<?xi32>
+    call @fillResource1DInt(%buf4, %i32_0) : (memref<?xi32>, i32) -> ()
+    call @fillResource1DInt(%buf5, %i32_0) : (memref<?xi32>, i32) -> ()
+
+    %idx_0 = arith.constant 0 : index
+    %idx_1 = arith.constant 1 : index
+    %idx_8 = arith.constant 8 : index
+
+    // Initialize input buffers.
+    %lhs_vals = arith.constant dense<[0, 1, -1, -1, 65535, 65535, -65535, 7]> : vector<8xi32>
+    %rhs_vals = arith.constant dense<[0, 1, 2, -1, 16, -1337, -65535, -7]> : vector<8xi32>
+    vector.store %lhs_vals, %buf0[%idx_0] : memref<8xi32>, vector<8xi32>
+    vector.store %rhs_vals, %buf1[%idx_0] : memref<8xi32>, vector<8xi32>
+
+    gpu.launch_func @kernels::@kernel_add
+        blocks in (%idx_8, %idx_1, %idx_1) threads in (%idx_1, %idx_1, %idx_1)
+        args(%buf0 : memref<8xi32>, %buf1 : memref<8xi32>, %buf2 : memref<8xi32>, %buf3 : memref<8xi32>)
+    %buf_low = memref.cast %buf4 : memref<?xi32> to memref<*xi32>
+    %buf_hi = memref.cast %buf5 : memref<?xi32> to memref<*xi32>
+    call @printMemrefI32(%buf_low) : (memref<*xi32>) -> ()
+    call @printMemrefI32(%buf_hi) : (memref<*xi32>) -> ()
+    return
+  }
+  func.func private @fillResource1DInt(%0 : memref<?xi32>, %1 : i32)
+  func.func private @printMemrefI32(%ptr : memref<*xi32>)
+}
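The CHECK values in the new runner test can be reproduced on the host: the kernel multiplies the inputs as unsigned 32-bit integers into a 64-bit product, and printMemrefI32 then prints each 32-bit half back as a signed i32. Below is a small illustrative sketch (not part of the test suite) for elements 2 and 5 of the input vectors.

// Reproduces two of the expected vectors on the host (illustrative only).
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t lhs[] = {-1, 65535};  // Elements 2 and 5 of %lhs_vals.
  const int32_t rhs[] = {2, -1337};   // Elements 2 and 5 of %rhs_vals.
  for (int i = 0; i < 2; ++i) {
    // Reinterpret as unsigned, multiply into 64 bits, then split the result.
    uint64_t wide = uint64_t(uint32_t(lhs[i])) * uint64_t(uint32_t(rhs[i]));
    int32_t low = int32_t(uint32_t(wide));
    int32_t high = int32_t(uint32_t(wide >> 32));
    std::printf("low = %d, high = %d\n", low, high);
  }
  // Prints: low = -2, high = 1              (element 2 of the CHECK lines)
  //         low = -87620295, high = 65534   (element 5 of the CHECK lines)
  return 0;
}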
diff --git a/mlir/tools/mlir-vulkan-runner/CMakeLists.txt b/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
--- a/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
@@ -74,6 +74,8 @@
     MLIRTargetLLVMIRExport
     MLIRTransforms
     MLIRTranslateLib
+    MLIRVectorDialect
+    MLIRVectorToLLVM
     ${Vulkan_LIBRARY}
   )
diff --git a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
--- a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
+++ b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
@@ -13,12 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
-#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
 #include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -30,18 +30,28 @@
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
 #include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/ExecutionEngine/JitRunner.h"
-#include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Export.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/TargetSelect.h"
 
 using namespace mlir;
 
-static LogicalResult runMLIRPasses(Operation *op, JitRunnerOptions &options) {
+namespace {
+struct VulkanRunnerOptions {
+  llvm::cl::OptionCategory category{"mlir-vulkan-runner options"};
+  llvm::cl::opt<bool> spirvWebGPUPrepare{
+      "vulkan-runner-spirv-webgpu-prepare",
+      llvm::cl::desc("Run MLIR transforms used when targeting WebGPU"),
+      llvm::cl::cat(category)};
+};
+} // namespace
+
+static LogicalResult runMLIRPasses(Operation *op,
+                                   VulkanRunnerOptions &options) {
   auto module = dyn_cast<ModuleOp>(op);
   if (!module)
     return op->emitOpError("expected a 'builtin.module' op");
@@ -55,10 +65,13 @@
   OpPassManager &modulePM = passManager.nest<spirv::ModuleOp>();
   modulePM.addPass(spirv::createLowerABIAttributesPass());
   modulePM.addPass(spirv::createUpdateVersionCapabilityExtensionPass());
+  if (options.spirvWebGPUPrepare)
+    modulePM.addPass(spirv::createSPIRVWebGPUPreparePass());
   passManager.addPass(createConvertGpuLaunchFuncToVulkanLaunchFuncPass());
 
   LowerToLLVMOptions llvmOptions(module.getContext(), DataLayout(module));
   passManager.addPass(createMemRefToLLVMConversionPass());
+  passManager.addPass(createConvertVectorToLLVMPass());
   passManager.nest<func::FuncOp>().addPass(LLVM::createRequestCWrappersPass());
   passManager.addPass(createConvertFuncToLLVMPass(llvmOptions));
   passManager.addPass(createReconcileUnrealizedCastsPass());
@@ -75,13 +88,21 @@
   llvm::InitializeNativeTarget();
   llvm::InitializeNativeTargetAsmPrinter();
 
+  // Initialize runner-specific CLI options. These will be parsed and
+  // initialized in `JitRunnerMain`.
+  VulkanRunnerOptions options;
+  auto runPassesWithOptions = [&options](Operation *op, JitRunnerOptions &) {
+    return runMLIRPasses(op, options);
+  };
+
   mlir::JitRunnerConfig jitRunnerConfig;
-  jitRunnerConfig.mlirTransformer = runMLIRPasses;
+  jitRunnerConfig.mlirTransformer = runPassesWithOptions;
 
   mlir::DialectRegistry registry;
   registry.insert<mlir::arith::ArithDialect, mlir::LLVM::LLVMDialect,
                   mlir::gpu::GPUDialect, mlir::spirv::SPIRVDialect,
-                  mlir::func::FuncDialect, mlir::memref::MemRefDialect>();
+                  mlir::func::FuncDialect, mlir::memref::MemRefDialect,
+                  mlir::vector::VectorDialect>();
   mlir::registerLLVMDialectTranslation(registry);
 
   return mlir::JitRunnerMain(argc, argv, registry, jitRunnerConfig);
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7303,6 +7303,8 @@
         ":SPIRVDialect",
         ":SPIRVTransforms",
         ":ToLLVMIRTranslation",
+        ":VectorDialect",
+        ":VectorToLLVM",
         "//llvm:Support",
     ],
 )
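As a side note, the CLI-option plumbing added to mlir-vulkan-runner.cpp follows a small, reusable pattern: JitRunnerConfig::mlirTransformer only accepts a callable taking (Operation *, JitRunnerOptions &), so runner-specific flags are carried in through a lambda capture. The sketch below isolates that pattern; it is illustrative only, with the real pass pipeline, dialect registration, and runMLIRPasses body elided and replaced by a stub.

// Standalone sketch of the runner-specific CLI option plumbing (illustrative).
#include "mlir/ExecutionEngine/JitRunner.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/Support/CommandLine.h"

namespace {
struct VulkanRunnerOptions {
  llvm::cl::OptionCategory category{"mlir-vulkan-runner options"};
  llvm::cl::opt<bool> spirvWebGPUPrepare{
      "vulkan-runner-spirv-webgpu-prepare",
      llvm::cl::desc("Run MLIR transforms used when targeting WebGPU"),
      llvm::cl::cat(category)};
};
} // namespace

int main(int argc, char **argv) {
  // Construct the options before JitRunnerMain so llvm::cl knows about them
  // when it parses argv.
  VulkanRunnerOptions options;
  // The transformer signature has no room for extra options, so they travel
  // through the lambda capture instead.
  auto transform = [&options](mlir::Operation *op, mlir::JitRunnerOptions &) {
    // A real runner would build a PassManager here and consult
    // options.spirvWebGPUPrepare, as the patch above does.
    (void)op;
    return mlir::success();
  };

  mlir::JitRunnerConfig config;
  config.mlirTransformer = transform;

  mlir::DialectRegistry registry;
  return mlir::JitRunnerMain(argc, argv, registry, config);
}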