diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -636,19 +636,21 @@
   Gather
 };
 
-/// Check whether /p val can be used for calculating an index for a contiguous
-/// load operation. This means that /p val should either:
-/// * be invariant with respect to /p linalgOp, or
-/// * increment by 1 with every loop iterator (when /p shouldBeConstant is
-/// false).
-/// Parameters /p trailingLoopDim and /p shouldBeConstant are used to analyze
-/// `linalg.index` ops.
-static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
-                                size_t trailingLoopDim, bool shouldBeConstant) {
-  auto *block = linalgOp.getBlock();
+/// Checks whether \p val can be used for calculating a loop invariant index.
+static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val) {
 
-  // Bail out if this is a block argument for this linalg.generic Op.
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim equal 1 are not yet supported");
+
+  // Blocks outside _this_ linalg.generic are effectively loop invariant.
+  // However, analysing block arguments for _this_ linalg.generic Op is a bit
+  // tricky. Just bail out in the latter case.
   // TODO: We could try analysing the corresponding affine map here.
+  auto *block = linalgOp.getBlock();
   if (val.dyn_cast<BlockArgument>())
     return llvm::all_of(block->getArguments(),
                         [&val](Value v) { return (v != val); });
@@ -656,61 +658,90 @@
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  // We know that we are reading into a 1-D tensor like this:
-  // `tensor<1x1x4xi32`. Given this assumption, the following Op:
-  // * `%idx = `linalg.index dim : index`,
-  // will either:
-  // 1. produce a constant when `dim` _is not_ the trailing loop dim, or
-  // 2. increment with stride one when `dim` _is_ the trailing loop dim.
+  // IndexOp is loop invariant as long as its result remains constant across
+  // iterations. Given the assumptions on the loop ranges above, only the
+  // trailing loop dim ever changes.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
   if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return shouldBeConstant ? (indexOp.getDim() != trailingLoopDim)
-                            : (indexOp.getDim() == trailingLoopDim);
+    return (indexOp.getDim() != trailingLoopDim);
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
-  // Values define outside `linalgOp`.
+  // Values defined outside `linalgOp` are loop invariant.
   if (!ancestor)
     return true;
 
-  // Values defined inside `linalgOp`, which are constant.
+  // Values defined inside `linalgOp`, which are constant, are loop invariant.
   if (dyn_cast<arith::ConstantOp>(ancestor))
    return true;
 
-  // Conservatively reject Ops that could lead to non-contiguous accesses.
-  if (!isa(ancestor))
-    return false;
-
   bool result = true;
   for (auto op : ancestor->getOperands())
-    result &=
-        isContiguousLoadIdx(linalgOp, op, trailingLoopDim, shouldBeConstant);
+    result &= isLoopInvariantIdx(linalgOp, op);
 
   return result;
 }
 
-/// Check whether the calculation of \p val is based on linalg.index Op with
-/// the dim attribute matching \p dim.
-static bool isBasedOnIndexOp(LinalgOp &linalgOp, Value &val, size_t dim) {
-  auto *block = linalgOp.getBlock();
-  auto targetShape = linalgOp.getStaticLoopRanges();
+/// Check whether \p val could be used for calculating the trailing index for a
+/// contiguous load operation.
+///
+/// There are currently 3 types of values that are allowed here:
+/// 1. loop-invariant values,
+/// 2. values that increment by 1 with every loop iteration,
+/// 3. results of basic arithmetic operations (linear and continuous)
+///    involving 1., 2. and 3.
+/// This method returns True if indeed only such values are used in calculating
+/// \p val.
+///
+/// Additionally, the trailing index for a contiguous load operation should
+/// increment by 1 with every loop iteration, i.e. be based on:
+/// * `linalg.index <dim>`,
+/// where <dim> is the trailing dim of the iteration space. \p foundIndexOp is
+/// updated to `true` when such an op is found.
+static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
+                                bool &foundIndexOp) {
 
-  if (val.isa<BlockArgument>())
-    return false;
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim 1 are not yet supported");
+
+  // Blocks outside _this_ linalg.generic are effectively loop invariant.
+  // However, analysing block arguments for _this_ linalg.generic Op is a bit
+  // tricky. Just bail out in the latter case.
+  // TODO: We could try analysing the corresponding affine map here.
+  auto *block = linalgOp.getBlock();
+  if (val.dyn_cast<BlockArgument>())
+    return llvm::all_of(block->getArguments(),
+                        [&val](Value v) { return (v != val); });
 
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return (indexOp.getDim() == dim);
+  // Given the assumption on the loop ranges above, only the trailing loop
+  // index is not constant.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
+  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp)) {
+    foundIndexOp = (indexOp.getDim() == trailingLoopDim);
+    return true;
+  }
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
   if (!ancestor)
     return false;
 
+  // Conservatively reject Ops that could lead to indices with stride other
+  // than 1.
+  if (!isa(
+          ancestor))
+    return false;
+
   bool result = false;
   for (auto op : ancestor->getOperands())
-    result |= isBasedOnIndexOp(linalgOp, op, dim);
+    result |= isContiguousLoadIdx(linalgOp, op, foundIndexOp);
 
   return result;
 }
 
@@ -725,7 +756,7 @@
   auto targetShape = linalgOp.getStaticLoopRanges();
 
-  // Assume that it's a gather load when reading _into_:
+  // 1. Assume that it's a gather load when reading _into_:
   // * an n-D vector, like`tensor<1x2x4xi32` or`tensor<2x1x4xi32>`, or
   // * a 1-D vector with the trailing dim equal 1, e.g. `tensor<1x4x1xi32`.
   // TODO: Relax these conditions.
@@ -736,44 +767,36 @@
   auto inputShape = extractOp.getTensor().getType().cast<ShapedType>();
 
-  // Assume that it's a gather load when reading _from_ a tensor for which the
-  // trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
+  // 2. Assume that it's a gather load when reading _from_ a tensor for which
+  // the trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
   // TODO: Relax this condition.
   if (inputShape.getShape().back() == 1)
     return VectorMemoryAccessKind::Gather;
 
-  // The trailing loop dim is needed when analyzing ops like:
-  // * %idx = `linalg.index : index`.
-  auto trailingLoopDim = targetShape.size() - 1;
-
   bool isContiguous = true;
 
-  // Iterate over all indices. Analyze the way each index is calculated and
-  // decide whether it is suitable for a contiguous load (e.g. loop invariant).
+  // 3a. Analyze the leading indices of `extractOp`.
+  // Look at the way each index is calculated and decide whether it is suitable
+  // for a contiguous load, i.e. whether it's loop invariant.
   auto indices = extractOp.getIndices();
-  for (auto [i, indexVal] : llvm::enumerate(indices)) {
-    if (inputShape.getShape()[i] == 1) {
-      // This index will always be equal 0, so it is a loop-invariant constant.
-      continue;
-    }
+  auto leadIndices = ValueRange(indices.drop_back(1));
 
-    // Should this index be loop invariant?
-    // * _no_ if this is the trailing index,
-    // * _yes_ otherwise.
-    auto extractOpBottomIdx = indices.size() - 1;
-    bool loopInvariantIndex = (i != extractOpBottomIdx);
+  for (auto [i, indexVal] : llvm::enumerate(leadIndices)) {
+    if (inputShape.getShape()[i] == 1)
+      continue;
 
-    isContiguous &= isContiguousLoadIdx(linalgOp, indexVal, trailingLoopDim,
-                                        loopInvariantIndex);
+    isContiguous &= isLoopInvariantIdx(linalgOp, indexVal);
   }
 
-  // The trailing index in the extract Op must increment with every iteration,
-  // which means that it must be based on a loop index. Given the assumption
-  // on the output tensor, only the trailing loop index is not constant, so
-  // that's what we need to check against.
+  // 3b. Analyze the trailing index for `extractOp`.
   auto extractOpTrailingIdx = indices.back();
+  // For contiguous loads, the trailing `extractOp` index should increment with
+  // every loop iteration. This effectively means that it must be based on the
+  // trailing loop index. This is what the following bool captures.
+  bool foundIndexOp = false;
   isContiguous &=
-      isBasedOnIndexOp(linalgOp, extractOpTrailingIdx, trailingLoopDim);
+      isContiguousLoadIdx(linalgOp, extractOpTrailingIdx, foundIndexOp);
+  isContiguous &= foundIndexOp;
 
   if (isContiguous) {
     LDBG("Found contigous load: " << extractOp);
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -1783,6 +1783,211 @@
 // CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
 // CHECK: return %[[VAL_14]] : tensor<5xf32>
 
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be
+// vectorised based on the converted ops. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.extractelement %[[VAL_9]]{{\[}}%[[VAL_4]] : i32] : vector<4xindex>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be
+// vectorised based on the converted ops. Gather load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%3, %c16] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
+// CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_10]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK: %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_12]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_14]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
+// allowed when calculating indices for load operations. Gather load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = arith.maxsi %2, %c79 : index
+    %extracted = tensor.extract %arg0[%3, %2] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
+// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
+// CHECK: %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK: %[[VAL_10:.*]] = vector.transfer_write %[[VAL_9]], %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_10]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
+// allowed when calculating indices for load operations. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 0 : index
+    %3 = linalg.index 1 : index
+    %4 = arith.maxsi %2, %c16 : index
+    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = vector.shape_cast %[[VAL_2]] : vector<1x4xindex> to vector<4xindex>
+// CHECK: %[[VAL_7:.*]] = vector.extractelement %[[VAL_6]]{{\[}}%[[VAL_3]] : i32] : vector<4xindex>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer assumes it's a gather load whenever using a block argument to
+// calculate an index.
+#map = affine_map<(d0) -> (d0)>
+func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
+  %0 = tensor.empty() : tensor<5xf32>
+  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg1: tensor<5xindex>) outs(%0 : tensor<5xf32>) {
+  ^bb0(%in: index, %out: f32):
+    %2 = linalg.index 0 : index
+    %extracted_0 = tensor.extract %arg0[%in, %2] : tensor<5x6xf32>
+    linalg.yield %extracted_0 : f32
+  } -> tensor<5xf32>
+  return %1 : tensor<5xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_block_arg(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<5x6xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
+// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
+// CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
+// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_3]] : vector<5xindex>
+// CHECK: %[[VAL_11:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]]] {{\[}}%[[VAL_10]]], %[[VAL_4]], %[[VAL_5]] : tensor<5x6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_7]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
+// CHECK: return %[[VAL_12]] : tensor<5xf32>
+// CHECK: }
+
 transform.sequence failures(propagate) {
  ^bb1(%arg1: !pdl.operation):
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation