diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1223,15 +1223,15 @@ permutation or broadcasting. Elements whose corresponding mask element is `0` are masked out and replaced with `padding`. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. - While the starting point of the transfer has to be in-bounds, accesses may - run out-of-bounds as indices increase. Broadcast dimensions must always be - in-bounds. If specified, the `in_bounds` array length has to be equal to the - vector rank. In absence of the attribute, accesses along all dimensions - (except for broadcasts) may run out-of-bounds. A `vector.transfer_read` can - be lowered to a simple load if all dimensions are specified to be within - bounds and no `mask` was specified. + An optional boolean array attribute `in_bounds` specifies for every tensor/ + memref dimension if the transfer is guaranteed to be within the source + bounds. While the starting point of the transfer has to be in-bounds, + accesses may run out-of-bounds as indices increase. If specified, the + `in_bounds` array length has to be equal to the rank of the source. In + absence of the attribute, accesses along all dimensions may run + out-of-bounds. A `vector.transfer_read` can be lowered to a simple load if + all dimensions are specified to be within bounds and no `mask` was + specified. This operation is called 'read' by opposition to 'load' because the super-vector granularity is generally not representable with a single @@ -1464,14 +1464,15 @@ specified to mask out elements. Elements whose corresponding mask element is `0` are masked out. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. - While the starting point of the transfer has to be in-bounds, accesses may - run out-of-bounds as indices increase. If specified, the `in_bounds` array - length has to be equal to the vector rank. In absence of the attribute, - accesses along all dimensions may run out-of-bounds. A - `vector.transfer_write` can be lowered to a simple store if all dimensions - are specified to be within bounds and no `mask` was specified. + An optional boolean array attribute `in_bounds` specifies for every tensor/ + memref dimension if the transfer is guaranteed to be within the source + bounds. While the starting point of the transfer has to be in-bounds, + accesses may run out-of-bounds as indices increase. If specified, the + `in_bounds` array length has to be equal to the rank of the source. In + absence of the attribute, accesses along all dimensions may run + out-of-bounds. A `vector.transfer_write` can be lowered to a simple store + if all dimensions are specified to be within bounds and no `mask` was + specified. 
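For illustration, a minimal read under the new convention (hypothetical shapes, not from the patch): the source memref has rank 3, so `in_bounds` has three entries; `d1` does not appear in the permutation map and therefore must be marked in-bounds, while the dynamic `d2` may run out-of-bounds.

```mlir
%r = vector.transfer_read %A[%c0, %c0, %c0], %pad
    {in_bounds = [true, true, false],
     permutation_map = affine_map<(d0, d1, d2) -> (d0, d2)>}
    : memref<4x2x?xf32>, vector<4x8xf32>
```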
This operation is called 'write' by opposition to 'store' because the super-vector granularity is generally not representable with a single diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -122,7 +122,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h @@ -89,8 +89,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... -/// true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// diff --git a/mlir/include/mlir/Interfaces/VectorInterfaces.td b/mlir/include/mlir/Interfaces/VectorInterfaces.td --- a/mlir/include/mlir/Interfaces/VectorInterfaces.td +++ b/mlir/include/mlir/Interfaces/VectorInterfaces.td @@ -75,9 +75,8 @@ /*args=*/(ins "unsigned":$dim), /*methodBody=*/"", /*defaultImplementation=*/[{ - return $_op.isBroadcastDim(dim) - || ($_op.getInBounds() - && cast<::mlir::BoolAttr>(cast<::mlir::ArrayAttr>(*$_op.getInBounds())[dim]).getValue()); + return $_op.getInBounds() + && cast<::mlir::BoolAttr>(cast<::mlir::ArrayAttr>(*$_op.getInBounds())[dim]).getValue(); }] >, InterfaceMethod< @@ -216,7 +215,8 @@ /*args=*/(ins), /*methodBody=*/"", /*defaultImplementation=*/[{ - for (unsigned idx = 0, e = $_op.getTransferRank(); idx < e; ++idx) + for (unsigned idx = 0, e = $_op.getShapedType().getRank(); + idx < e; ++idx) if (!$_op.isDimInBounds(idx)) return true; return false; diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -107,6 +107,21 @@ } } +/// Calculate the in_bounds attribute of the new vector transfer op. The dropped +/// vector transfer dimension is now in-bounds; an scf.if check is generated +/// around the new transfer op. +template <typename OpTy> +static ArrayAttr getXferInBoundsAttr(OpBuilder &b, OpTy xferOp) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + auto dim = unpackedDim(xferOp); + bool isBroadcast = !dim.has_value(); + if (!isBroadcast) + inBounds[*dim] = true; + return b.getBoolArrayAttr(inBounds); +} + static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal, Value value) { if (hasRetVal) { @@ -173,7 +188,7 @@ bool isBroadcast = !dim; // No in-bounds check for broadcasts. 
Location loc = xferOp.getLoc(); ImplicitLocOpBuilder lb(xferOp.getLoc(), b); - if (!xferOp.isDimInBounds(0) && !isBroadcast) { + if (!isBroadcast && !xferOp.isDimInBounds(*dim)) { Value memrefDim = vector::createOrFoldDimOp(b, loc, xferOp.getSource(), *dim); AffineExpr d0, d1; @@ -239,13 +254,6 @@ }); } -/// Given an ArrayAttr, return a copy where the first element is dropped. -static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) { - if (!attr) - return attr; - return ArrayAttr::get(b.getContext(), attr.getValue().drop_front()); -} - /// Add the pass label to a vector transfer op if its rank is not the target /// rank. template <typename OpTy> @@ -410,11 +418,10 @@ Location loc = xferOp.getLoc(); auto bufferType = dyn_cast<ShapedType>(buffer.getType()); auto vecType = dyn_cast<VectorType>(bufferType.getElementType()); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferReadOp>( loc, vecType, xferOp.getSource(), xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), - xferOp.getPadding(), Value(), inBoundsAttr); + xferOp.getPadding(), Value(), getXferInBoundsAttr(b, xferOp)); maybeApplyPassLabel(b, newXferOp, options.targetRank); @@ -497,13 +504,12 @@ Location loc = xferOp.getLoc(); auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto source = loopState.empty() ? xferOp.getSource() : loopState[0]; Type type = isTensorOp(xferOp) ? xferOp.getShapedType() : Type(); auto newXferOp = b.create<vector::TransferWriteOp>( loc, type, vec, source, xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(), - inBoundsAttr); + getXferInBoundsAttr(b, xferOp)); maybeApplyPassLabel(b, newXferOp, options.targetRank); @@ -931,11 +937,10 @@ getInsertionIndices(xferOp, insertionIndices); insertionIndices.push_back(i); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferReadOp>( loc, newXferVecType, xferOp.getSource(), xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), - xferOp.getPadding(), Value(), inBoundsAttr); + xferOp.getPadding(), Value(), getXferInBoundsAttr(b, xferOp)); maybeAssignMask(b, xferOp, newXferOp, i); return b.create<vector::InsertOp>(loc, newXferOp, vec, insertionIndices); @@ -1059,11 +1064,10 @@ auto extracted = b.create<vector::ExtractOp>(loc, vec, extractionIndices); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferWriteOp>( loc, sourceType, extracted, source, xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(), - inBoundsAttr); + getXferInBoundsAttr(b, xferOp)); maybeAssignMask(b, xferOp, newXferOp, i); @@ -1214,7 +1218,8 @@ /// E.g.: /// ``` /// vector.transfer_write %vec, %A[%a, %b] -/// {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]} +/// {permutation_map = affine_map<(d0, d1) -> (d0)>, +/// in_bounds = [true, true]} /// : vector<9xf32>, memref<?x?xf32> /// ``` /// Is rewritten to approximately the following pseudo-IR: diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1201,8 +1201,14 @@ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); + // Non-transfer dims are in-bounds. 
+ SmallVector<bool> inBounds(memRefType.getRank(), true); + for (AffineExpr expr : permutationMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; auto transfer = state.builder.create<vector::TransferReadOp>( - loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap); + loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap, + inBounds); // Register replacement for future uses in the scope. state.registerOpVectorReplacement(loadOp, transfer); @@ -1244,9 +1250,14 @@ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); + // Non-transfer dims are in-bounds. + SmallVector<bool> inBounds(memRefType.getRank(), true); + for (AffineExpr expr : permutationMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; auto transfer = state.builder.create<vector::TransferWriteOp>( storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices, - permutationMap); + permutationMap, inBounds); LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer); // Register replacement for future uses in the scope. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -640,7 +640,7 @@ // be in-bounds. if (auto maskOp = dyn_cast<vector::MaskOp>(write)) { auto maskedWriteOp = cast<vector::TransferWriteOp>(maskOp.getMaskableOp()); - SmallVector<bool> inBounds(maskedWriteOp.getVectorType().getRank(), true); + SmallVector<bool> inBounds(maskedWriteOp.getShapedType().getRank(), true); maskedWriteOp.setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); } @@ -1083,7 +1083,7 @@ // `tensor.extract_element` is always in-bounds, hence the following holds. auto dstRank = resultType.getRank(); auto srcRank = extractOp.getTensor().getType().getRank(); - SmallVector<bool> inBounds(dstRank, true); + SmallVector<bool> inBounds(srcRank, true); // 2a. Handle scalar broadcast access. if (memAccessKind == VectorMemoryAccessKind::ScalarBroadcast) { @@ -1337,15 +1337,21 @@ SmallVector<Value> indices(linalgOp.getShape(opOperand).size(), zero); + // Non-transfer dims are in-bounds, all others are out-of-bounds. + SmallVector<bool> inBounds(indices.size(), true); + for (AffineExpr expr : readMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; + Operation *read = rewriter.create<vector::TransferReadOp>( - loc, readType, opOperand->get(), indices, readMap); + loc, readType, opOperand->get(), indices, readMap, inBounds); read = state.maskOperation(rewriter, read, linalgOp, maskingMap); Value readValue = read->getResult(0); // 3.b. If masked, set in-bounds to true. Masking guarantees that the access // will be in-bounds. 
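A sketch of what the vectorizers now emit (made-up shapes and values): the permutation map broadcasts along the second result, `d1` is a non-transfer dim and is seeded in-bounds, and the transfer dim `d0` starts out conservatively out-of-bounds.

```mlir
%v = vector.transfer_read %m[%i, %j], %pad
    {in_bounds = [false, true],
     permutation_map = affine_map<(d0, d1) -> (d0, 0)>}
    : memref<?x?xf32>, vector<4x8xf32>
```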
if (auto maskOp = dyn_cast<vector::MaskOp>(read)) { - SmallVector<bool> inBounds(readType.getRank(), true); + SmallVector<bool> inBounds(indices.size(), true); cast<vector::TransferReadOp>(maskOp.getMaskableOp()) .setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); } @@ -1927,7 +1933,7 @@ /// is rewritten to: /// ``` /// %r = vector.transfer_read %src[%c0, %c0], %padding -/// {in_bounds = [true, true]} +/// {in_bounds = [false, false]} /// : tensor<?x?xf32>, vector<17x5xf32> /// ``` /// Note: By restricting this pattern to in-bounds TransferReadOps, we can be @@ -1956,7 +1962,7 @@ return failure(); rewriter.updateRootInPlace(xferOp, [&]() { - SmallVector<bool> inBounds(xferOp.getVectorType().getRank(), false); + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); xferOp->setAttr(xferOp.getInBoundsAttrName(), rewriter.getBoolArrayAttr(inBounds)); xferOp.getSourceMutable().assign(padOp.getSource()); @@ -2033,7 +2039,7 @@ // Insert the new TransferWriteOp at position of the old TransferWriteOp. rewriter.setInsertionPoint(xferOp); - SmallVector<bool> inBounds(xferOp.getVectorType().getRank(), false); + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); auto newXferOp = rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( xferOp, padOp.getSource().getType(), xferOp.getVector(), padOp.getSource(), xferOp.getIndices(), xferOp.getPermutationMapAttr(), @@ -2142,7 +2148,8 @@ /// %0 = vector.transfer_read %src[%c0, %c0], %padding /// : tensor<?x?xf32>, vector<17x5xf32> /// %r = vector.transfer_write %0, %dest[%a, %b, %c0, %c0] -/// {in_bounds = [true, true]} : vector<17x5xf32>, tensor<?x?x17x5xf32> +/// {in_bounds = [true, true, true, true]} : vector<17x5xf32>, +/// tensor<?x?x17x5xf32> /// ``` /// /// This rewrite is possible if: @@ -2208,7 +2215,7 @@ // source must fit into the destination at the specified offsets. auto writeIndices = ofrToIndexValues(rewriter, padOp.getLoc(), insertOp.getMixedOffsets()); - SmallVector<bool> inBounds(vecRank, true); + SmallVector<bool> inBounds(tensorRank, true); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( insertOp, read, insertOp.getDest(), writeIndices, ArrayRef<bool>{inBounds}); @@ -2337,11 +2344,17 @@ // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_read, the attribute must be reset // conservatively. + + // in_bounds is explicitly reset. Non-transfer dims are in-bounds, all others + // are out-of-bounds. + SmallVector<bool> inBounds(xferOp.getIndices().size(), true); + for (AffineExpr expr : xferOp.getPermutationMap().getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; Value res = rewriter.create<vector::TransferReadOp>( xferOp.getLoc(), xferOp.getVectorType(), in, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getPadding(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr(inBounds)); if (maybeFillOp) rewriter.eraseOp(maybeFillOp); @@ -2395,11 +2408,17 @@ // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_write, the attribute must be reset // conservatively. + + // in_bounds is explicitly reset. Non-transfer dims are in-bounds, all others + // are out-of-bounds. 
+ SmallVector<bool> inBounds(xferOp.getIndices().size(), true); + for (AffineExpr expr : xferOp.getPermutationMap().getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; rewriter.create<vector::TransferWriteOp>( xferOp.getLoc(), xferOp.getVector(), out, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr(inBounds)); rewriter.eraseOp(copyOp); rewriter.eraseOp(xferOp); diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -45,6 +45,32 @@ // Utility functions //===----------------------------------------------------------------------===// +/// Compute the new in_bounds attribute of a vector transfer op when folding a +/// potentially rank-reducing alias op into a vector transfer op. +template <typename OpTy> +static ArrayAttr +expandInBoundsToRank(Builder &b, OpTy xferOp, int64_t rank, + const llvm::SmallBitVector &projectedDimensions) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + SmallVector<bool> expandedInBounds; + int64_t idx = 0; + for (int64_t i = 0; i < rank; ++i) { + if (projectedDimensions.test(i)) { + // This dimension was rank-reduced. It must be in-bounds after folding. + expandedInBounds.push_back(true); + continue; + } + + // Not a rank-reduced dim: take in_bounds from the xfer op. + assert(idx < inBounds.size() && "invalid rank"); + expandedInBounds.push_back(inBounds[idx++]); + } + assert(idx == inBounds.size() && "invalid rank"); + return b.getBoolArrayAttr(expandedInBounds); +} + /// Given the 'indices' of a load/store operation where the memref is a result /// of a expand_shape op, returns the indices w.r.t to the source memref of the /// expand_shape op. 
For example @@ -404,12 +430,14 @@ op, op.getType(), subViewOp.getSource(), sourceIndices); }) .Case([&](vector::TransferReadOp op) { + int64_t rank = subViewOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferReadOp>( op, op.getVectorType(), subViewOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - op.getPermutationMap(), subViewOp.getSourceType().getRank(), - subViewOp.getDroppedDims())), - op.getPadding(), /*mask=*/Value(), op.getInBoundsAttr()); + AffineMapAttr::get(expandDimsToRank(op.getPermutationMap(), rank, + subViewOp.getDroppedDims())), + op.getPadding(), /*mask=*/Value(), + expandInBoundsToRank(rewriter, op, rank, + subViewOp.getDroppedDims())); }) .Case([&](gpu::SubgroupMmaLoadMatrixOp op) { rewriter.replaceOpWithNewOp<gpu::SubgroupMmaLoadMatrixOp>( @@ -533,12 +561,13 @@ op.getNontemporal()); }) .Case([&](vector::TransferWriteOp op) { + int64_t rank = subViewOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, op.getValue(), subViewOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - op.getPermutationMap(), subViewOp.getSourceType().getRank(), - subViewOp.getDroppedDims())), - op.getInBoundsAttr()); + AffineMapAttr::get(expandDimsToRank(op.getPermutationMap(), rank, + subViewOp.getDroppedDims())), + expandInBoundsToRank(rewriter, op, rank, + subViewOp.getDroppedDims())); }) .Case([&](gpu::SubgroupMmaStoreMatrixOp op) { rewriter.replaceOpWithNewOp<gpu::SubgroupMmaStoreMatrixOp>( diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp --- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp @@ -84,6 +84,32 @@ return success(); } +/// Compute the new in_bounds attribute of a vector transfer op when folding a +/// potentially rank-reducing subset op into a vector transfer op. +template <typename OpTy> +static ArrayAttr +expandInBoundsToRank(Builder &b, OpTy xferOp, int64_t rank, + const llvm::SmallBitVector &projectedDimensions) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + SmallVector<bool> expandedInBounds; + int64_t idx = 0; + for (int64_t i = 0; i < rank; ++i) { + if (projectedDimensions.test(i)) { + // This dimension was rank-reduced. It must be in-bounds after folding. + expandedInBounds.push_back(true); + continue; + } + + // Not a rank-reduced dim: take in_bounds from the xfer op. 
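The IR-level effect of `expandInBoundsToRank`, sketched with assumed shapes: folding a rank-reducing `memref.subview` into a transfer re-introduces the dropped dimension, whose new `in_bounds` entry is `true`; the remaining entries carry over unchanged.

```mlir
// Before folding: dim 1 of %src is rank-reduced away by the subview.
%sv = memref.subview %src[0, 0, 0] [4, 1, 8] [1, 1, 1]
    : memref<4x1x8xf32> to memref<4x8xf32>
%v = vector.transfer_read %sv[%c0, %c0], %pad {in_bounds = [true, false]}
    : memref<4x8xf32>, vector<4x8xf32>

// After folding: the dropped dim 1 gets an in_bounds entry of `true`.
%v = vector.transfer_read %src[%c0, %c0, %c0], %pad
    {in_bounds = [true, true, false],
     permutation_map = affine_map<(d0, d1, d2) -> (d0, d2)>}
    : memref<4x1x8xf32>, vector<4x8xf32>
```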
+ assert(idx < inBounds.size() && "invalid rank"); + expandedInBounds.push_back(inBounds[idx++]); + } + assert(idx == inBounds.size() && "invalid rank"); + return b.getBoolArrayAttr(expandedInBounds); +} + LogicalResult TransferReadOfExtractSliceOpFolder::matchAndRewrite( vector::TransferReadOp readOp, PatternRewriter &rewriter) const { auto extractSliceOp = @@ -105,13 +131,16 @@ extractSliceOp.getMixedStrides(), extractSliceOp.getDroppedDims(), indices, sourceIndices); + int64_t expandedRank = extractSliceOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferReadOp>( readOp, readOp.getVectorType(), extractSliceOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - readOp.getPermutationMap(), extractSliceOp.getSourceType().getRank(), - extractSliceOp.getDroppedDims())), + AffineMapAttr::get(expandDimsToRank(readOp.getPermutationMap(), + expandedRank, + extractSliceOp.getDroppedDims())), readOp.getPadding(), - /*mask=*/Value(), readOp.getInBoundsAttr()); + /*mask=*/Value(), + expandInBoundsToRank(rewriter, readOp, expandedRank, + extractSliceOp.getDroppedDims())); return success(); } @@ -137,12 +166,14 @@ insertSliceOp.getMixedStrides(), insertSliceOp.getDroppedDims(), indices, sourceIndices); + int64_t expandedRank = insertSliceOp.getDestType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( insertSliceOp, writeOp.getValue(), insertSliceOp.getDest(), sourceIndices, AffineMapAttr::get(expandDimsToRank(writeOp.getPermutationMap(), - insertSliceOp.getDestType().getRank(), + expandedRank, insertSliceOp.getDroppedDims())), - writeOp.getInBoundsAttr()); + expandInBoundsToRank(rewriter, writeOp, expandedRank, + insertSliceOp.getDroppedDims())); return success(); } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3464,17 +3464,28 @@ << ") don't match"; if (inBounds) { - if (permutationMap.getNumResults() != static_cast<int64_t>(inBounds.size())) + if (shapedType.getRank() != static_cast<int64_t>(inBounds.size())) return op->emitOpError("expects the optional in_bounds attr of same rank " - "as permutation_map results: ") - << AffineMapAttr::get(permutationMap) + "as the source type: ") + << shapedType.getRank() << " vs inBounds of size: " << inBounds.size(); - for (unsigned int i = 0; i < permutationMap.getNumResults(); ++i) - if (permutationMap.getResult(i).isa<AffineConstantExpr>() && - !llvm::cast<BoolAttr>(inBounds.getValue()[i]).getValue()) - return op->emitOpError("requires broadcast dimensions to be in-bounds"); } + // Make sure that all non-transfer dimensions are in-bounds. + SmallVector<bool> inBoundsVals(op.getShapedType().getRank(), false); + if (inBounds) + inBoundsVals = extractFromIntegerArrayAttr<bool>(inBounds); + DenseSet<int64_t> xferDims; + for (AffineExpr expr : permutationMap.getResults()) { + if (auto dimExpr = expr.template dyn_cast<AffineDimExpr>()) + xferDims.insert(dimExpr.getPosition()); + } + for (int64_t i = 0, e = op.getShapedType().getRank(); i < e; ++i) + if (!xferDims.contains(i) && !op.isDimInBounds(i)) { + return op->emitOpError( + "expects that all non-transfer dims are in-bounds"); + } + return success(); } @@ -3654,32 +3665,42 @@ template <typename TransferOp> static LogicalResult foldTransferInBoundsAttribute(TransferOp op) { - // TODO: support 0-d corner case. - // TODO: Be less conservative. - if (op.getTransferRank() == 0) - return failure(); - AffineMap permutationMap = op.getPermutationMap(); + SmallVector<int64_t> accessedChunk = op.getTransferChunkAccessed(); + + // Prepare new in_bounds values. 
bool changed = false; - SmallVector<bool> newInBounds; - newInBounds.reserve(op.getTransferRank()); - for (unsigned i = 0; i < op.getTransferRank(); ++i) { + SmallVector<bool> newInBounds(op.getShapedType().getRank(), false); + if (op.getInBounds().has_value()) + newInBounds = extractFromIntegerArrayAttr<bool>(op.getInBoundsAttr()); + + for (unsigned i = 0; i < op.getShapedType().getRank(); ++i) { // Already marked as in-bounds, nothing to see here. - if (op.isDimInBounds(i)) { - newInBounds.push_back(true); + if (newInBounds[i]) continue; - } - // Currently out-of-bounds, check whether we can statically determine it is - // inBounds. - auto dimExpr = permutationMap.getResult(i).dyn_cast<AffineDimExpr>(); - assert(dimExpr && "Broadcast dims must be in-bounds"); - auto inBounds = - isInBounds(op, /*resultIdx=*/i, /*indicesIdx=*/dimExpr.getPosition()); - newInBounds.push_back(inBounds); - // We commit the pattern if it is "more inbounds". + + // Cannot infer in_bounds for dynamic dimensions. + if (op.getShapedType().isDynamicDim(i)) + continue; + int64_t sourceSize = op.getShapedType().getDimSize(i); + + // Cannot infer in_bounds for non-constant indices. + Value index = op.getIndices()[i]; + std::optional<int64_t> constantOffset = getConstantIntValue(index); + if (!constantOffset.has_value()) + continue; + + // If the dimension is not part of the transfer shape, it's like a "1" + // dimension that is cast away. + int64_t vectorSize = accessedChunk[i] == 0 ? 1 : accessedChunk[i]; + + bool inBounds = constantOffset.value() + vectorSize <= sourceSize; + newInBounds[i] = inBounds; changed |= inBounds; } + if (!changed) return failure(); + // OpBuilder is only used as a helper to build an I64ArrayAttr. OpBuilder b(op.getContext()); op->setAttr(TransferOp::getInBoundsAttrStrName(), @@ -3747,11 +3768,11 @@ /// Example: /// ``` /// %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] -/// {in_bounds = [true, true], +/// {in_bounds = [true, true, true], /// permutation_map = affine_map<(d0, d1, d2) -> (d2, d1)>} : /// vector<4x1xf32>, tensor<4x4x4xf32> /// %r = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 -/// {in_bounds = [true, true, true, true], +/// {in_bounds = [true, true, true], /// permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} : /// tensor<4x4x4xf32>, vector<1x100x4x5xf32> /// ``` diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -20,19 +20,6 @@ using namespace mlir; using namespace mlir::vector; -/// Transpose a vector transfer op's `in_bounds` attribute by applying reverse -/// permutation based on the given indices. -static ArrayAttr -inverseTransposeInBoundsAttr(OpBuilder &builder, ArrayAttr attr, - const SmallVector<unsigned> &permutation) { - SmallVector<bool> newInBoundsValues(permutation.size()); - size_t index = 0; - for (unsigned pos : permutation) - newInBoundsValues[pos] = - cast<BoolAttr>(attr.getValue()[index++]).getValue(); - return builder.getBoolArrayAttr(newInBoundsValues); -} - /// Extend the rank of a vector Value by `addedRanks` by adding outer unit /// dimensions. static Value extendVectorRank(OpBuilder &builder, Location loc, Value vec, @@ -119,19 +106,13 @@ newVectorShape[pos.value()] = originalShape[pos.index()]; } - // Transpose in_bounds attribute. - ArrayAttr newInBoundsAttr = - op.getInBounds() ? 
inverseTransposeInBoundsAttr( rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); - // Generate new transfer_read operation. VectorType newReadType = VectorType::get(newVectorShape, op.getVectorType().getElementType()); Value newRead = rewriter.create<vector::TransferReadOp>( op.getLoc(), newReadType, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), op.getPadding(), op.getMask(), - newInBoundsAttr); + op.getInBounds() ? *op.getInBounds() : ArrayAttr()); // Transpose result of transfer_read. SmallVector<int64_t> transposePerm(permutation.begin(), permutation.end()); @@ -189,12 +170,6 @@ return expr.dyn_cast<AffineDimExpr>().getPosition(); }); - // Transpose in_bounds attribute. - ArrayAttr newInBoundsAttr = - op.getInBounds() ? inverseTransposeInBoundsAttr( - rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); - // Generate new transfer_write operation. Value newVec = rewriter.create<vector::TransposeOp>( op.getLoc(), op.getVector(), indices); @@ -202,7 +177,7 @@ map.getNumDims(), map.getNumResults(), rewriter.getContext()); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, newVec, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), - op.getMask(), newInBoundsAttr); + op.getMask(), op.getInBounds() ? *op.getInBounds() : ArrayAttr()); return success(); } @@ -272,15 +247,9 @@ exprs.append(map.getResults().begin(), map.getResults().end()); AffineMap newMap = AffineMap::get(map.getNumDims(), 0, exprs, op.getContext()); - // All the new dimensions added are inbound. - SmallVector<bool> newInBoundsValues(missingInnerDim.size(), true); - for (int64_t i = 0, e = op.getVectorType().getRank(); i < e; ++i) { - newInBoundsValues.push_back(op.isDimInBounds(i)); - } - ArrayAttr newInBoundsAttr = rewriter.getBoolArrayAttr(newInBoundsValues); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, newVec, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), - newMask, newInBoundsAttr); + newMask, op.getInBoundsAttr()); return success(); } }; @@ -353,15 +322,10 @@ VectorType newReadType = VectorType::get(newShape, originalVecType.getElementType()); - ArrayAttr newInBoundsAttr = - op.getInBounds() - ? 
rewriter.getArrayAttr( - op.getInBoundsAttr().getValue().take_back(reducedShapeRank)) - : ArrayAttr(); Value newRead = rewriter.create<vector::TransferReadOp>( op.getLoc(), newReadType, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), op.getPadding(), op.getMask(), - newInBoundsAttr); + op.getInBoundsAttr()); rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, originalVecType, newRead); return success(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -216,15 +216,10 @@ AffineMap::get(oldMap.getNumDims(), oldMap.getNumSymbols(), newResults, rewriter.getContext()); - ArrayAttr inBoundsAttr; - if (read.getInBounds()) - inBoundsAttr = rewriter.getArrayAttr( - read.getInBoundsAttr().getValue().take_back(newType.getRank())); - auto newRead = rewriter.create<vector::TransferReadOp>( read.getLoc(), newType, read.getSource(), read.getIndices(), AffineMapAttr::get(newMap), read.getPadding(), /*mask=*/Value(), - inBoundsAttr); + read.getInBoundsAttr()); rewriter.replaceOpWithNewOp<vector::BroadcastOp>(read, oldType, newRead); return success(); @@ -264,16 +259,11 @@ AffineMap::get(oldMap.getNumDims(), oldMap.getNumSymbols(), newResults, rewriter.getContext()); - ArrayAttr inBoundsAttr; - if (write.getInBounds()) - inBoundsAttr = rewriter.getArrayAttr( - write.getInBoundsAttr().getValue().take_back(newType.getRank())); - auto newVector = rewriter.create<vector::ExtractOp>( write.getLoc(), write.getVector(), splatZero(dropDim)); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( write, newVector, write.getSource(), write.getIndices(), - AffineMapAttr::get(newMap), inBoundsAttr); + AffineMapAttr::get(newMap), write.getInBoundsAttr()); return success(); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -523,7 +523,8 @@ vectorType.getElementType()); vector::TransferReadOp flatRead = rewriter.create<vector::TransferReadOp>( loc, flatVectorType, collapsedSource, collapsedIndices, collapsedMap); - flatRead.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + SmallVector<bool> newInBounds(collapsedIndices.size(), true); + flatRead.setInBoundsAttr(rewriter.getBoolArrayAttr(newInBounds)); rewriter.replaceOpWithNewOp<vector::ShapeCastOp>( transferReadOp, cast<VectorType>(vector.getType()), flatRead); return success(); @@ -586,7 +587,8 @@ vector::TransferWriteOp flatWrite = rewriter.create<vector::TransferWriteOp>( loc, flatVector, collapsedSource, collapsedIndices, collapsedMap); - flatWrite.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + SmallVector<bool> newInBounds(collapsedIndices.size(), true); + flatWrite.setInBoundsAttr(rewriter.getBoolArrayAttr(newInBounds)); rewriter.eraseOp(transferWriteOp); return success(); } @@ -744,6 +746,9 @@ // Map not supported. if (!xferOp.getPermutationMap().isMinorIdentity()) return failure(); + // Cannot rewrite out-of-bounds transfers. + if (xferOp.hasOutOfBoundsDim()) + return failure(); // Only float and integer element types are supported. 
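For the flattening patterns above, the new `in_bounds` is sized by the collapsed source rank rather than hardcoded to a single entry; a sketch with assumed shapes:

```mlir
%flat = memref.collapse_shape %m [[0, 1]] : memref<4x8xf32> into memref<32xf32>
%v = vector.transfer_read %flat[%c0], %pad {in_bounds = [true]}
    : memref<32xf32>, vector<32xf32>
```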
Value scalar; if (vecType.getRank() == 0) { diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -40,39 +40,48 @@ /// Build the condition to ensure that a particular VectorTransferOpInterface /// is in-bounds. -static Value createInBoundsCond(RewriterBase &b, - VectorTransferOpInterface xferOp) { +static Value createInBoundsCondition(RewriterBase &b, + VectorTransferOpInterface xferOp) { assert(xferOp.permutation_map().isMinorIdentity() && "Expected minor identity map"); + SmallVector<int64_t> accessedChunk = xferOp.getTransferChunkAccessed(); Value inBoundsCond; - xferOp.zipResultAndIndexing([&](int64_t resultIdx, int64_t indicesIdx) { - // Zip over the resulting vector shape and memref indices. + + for (int64_t dim = 0, e = xferOp.getShapedType().getRank(); dim < e; ++dim) { // If the dimension is known to be in-bounds, it does not participate in // the construction of `inBoundsCond`. - if (xferOp.isDimInBounds(resultIdx)) - return; + if (xferOp.isDimInBounds(dim)) + continue; + // Fold or create the check that `index + vector_size` <= `memref_size`. Location loc = xferOp.getLoc(); - int64_t vectorSize = xferOp.getVectorType().getDimSize(resultIdx); + // If the dimension is not part of the transfer shape, it's like a "1" + // dimension that is cast away. + int64_t vectorSize = accessedChunk[dim] == 0 ? 1 : accessedChunk[dim]; OpFoldResult sum = affine::makeComposedFoldedAffineApply( b, loc, b.getAffineDimExpr(0) + b.getAffineConstantExpr(vectorSize), - {xferOp.indices()[indicesIdx]}); - OpFoldResult dimSz = - memref::getMixedSize(b, loc, xferOp.source(), indicesIdx); + {xferOp.indices()[dim]}); + OpFoldResult dimSz = memref::getMixedSize(b, loc, xferOp.source(), dim); + + // Skip the check if it is statically "true". auto maybeCstSum = getConstantIntValue(sum); auto maybeCstDimSz = getConstantIntValue(dimSz); if (maybeCstSum && maybeCstDimSz && *maybeCstSum <= *maybeCstDimSz) - return; + continue; + + // Generate condition. Value cond = b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sle, getValueOrCreateConstantIndexOp(b, loc, sum), getValueOrCreateConstantIndexOp(b, loc, dimSz)); + // Conjunction over all dims for which we are in-bounds. if (inBoundsCond) inBoundsCond = b.create<arith::AndIOp>(loc, inBoundsCond, cond); else inBoundsCond = cond; - }); + } + return inBoundsCond; } @@ -100,7 +109,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// @@ -481,7 +490,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// @@ -503,7 +512,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index /// } -/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true ... 
+/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true, /// true]} /// scf.if (%notInBounds) { /// // slowpath: not in-bounds vector.transfer or linalg.copy. @@ -522,7 +531,7 @@ if (options.vectorTransferSplit == VectorTransferSplit::None) return failure(); - SmallVector<bool> bools(xferOp.getTransferRank(), true); + SmallVector<bool> bools(xferOp.getShapedType().getRank(), true); auto inBoundsAttr = b.getBoolArrayAttr(bools); if (options.vectorTransferSplit == VectorTransferSplit::ForceInBounds) { b.updateRootInPlace(xferOp, [&]() { @@ -550,7 +559,7 @@ RewriterBase::InsertionGuard guard(b); b.setInsertionPoint(xferOp); - Value inBoundsCond = createInBoundsCond( + Value inBoundsCond = createInBoundsCondition( b, cast<VectorTransferOpInterface>(xferOp.getOperation())); if (!inBoundsCond) return failure(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -992,6 +992,8 @@ bounds); } +/// Materialize a mask for a 1D vector transfer, where the transfer dimension +/// is out-of-bounds. template <typename ConcreteOp> struct MaterializeTransferMask : public OpRewritePattern<ConcreteOp> { public: @@ -1002,10 +1004,20 @@ LogicalResult matchAndRewrite(ConcreteOp xferOp, PatternRewriter &rewriter) const override { - if (!xferOp.hasOutOfBoundsDim()) + if (xferOp.getVectorType().getRank() > 1 || xferOp.getIndices().empty()) return failure(); - if (xferOp.getVectorType().getRank() > 1 || xferOp.getIndices().empty()) + // Get transfer dimension. + assert(xferOp.getPermutationMap().getNumResults() == 1 && + "expected one result in AffineMap"); + AffineDimExpr dimExpr = xferOp.getPermutationMap() + .getResult(0) + .template dyn_cast<AffineDimExpr>(); + assert(dimExpr && "expected dimension, not broadcast"); + int64_t transferDim = dimExpr.getPosition(); + + // The dimension must be out-of-bounds. + if (xferOp.isDimInBounds(transferDim)) return failure(); Location loc = xferOp->getLoc(); @@ -1031,9 +1043,17 @@ mask = rewriter.create<arith::AndIOp>(loc, mask, xferOp.getMask()); } + // The masked dimension is now in-bounds. 
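An IR-level sketch of the mask materialization under the new semantics (hypothetical shapes): only the single transfer dimension flips to in-bounds once the mask is attached; entries for the other source dims are preserved.

```mlir
// Before: d1 is the transfer dim and is out-of-bounds.
%v = vector.transfer_read %m[%i, %j], %pad {in_bounds = [true, false]}
    : memref<4x?xf32>, vector<8xf32>

// After: a mask guards d1, so it is marked in-bounds; d0 is unchanged.
%mask = vector.create_mask %rem : vector<8xi1>
%v2 = vector.transfer_read %m[%i, %j], %pad, %mask {in_bounds = [true, true]}
    : memref<4x?xf32>, vector<8xf32>
```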
+ SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + assert(!inBounds[transferDim] && + "expected that dimension was out-of-bounds"); + inBounds[transferDim] = true; + rewriter.updateRootInPlace(xferOp, [&]() { xferOp.getMaskMutable().assign(mask); - xferOp.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + xferOp.setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); }); return success(); diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir --- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir +++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir @@ -7,7 +7,7 @@ // CHECK:%[[base:[0-9]+]] = llvm.extractvalue // CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]] // CHECK:llvm.store %[[val]], %[[ptr]] - vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true]} : vector<1xf32>, memref<1024x1024xf32> + vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true, true]} : vector<1xf32>, memref<1024x1024xf32> } return } diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir @@ -276,17 +276,17 @@ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[c0]], [[m_coord]], [[c0]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} - %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true} - %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false} - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16> // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : 
vector<4x2xf16> to vector<2x2xf16> @@ -294,7 +294,7 @@ %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16> %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16> - vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>> + vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>> return } @@ -319,19 +319,19 @@ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<4x2xf16> - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]] // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16> - %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16> + %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16> // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>> return } diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir @@ -172,7 +172,7 @@ %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %D = 
vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> %E = vector.transfer_read %arg3[%c0, %c0, %c0, %c0], %cst - {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3)->(0, d3)>} + {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1, d2, d3)->(0, d3)>} : memref<16x16x16x16xf16>, vector<16x16xf16> %F = arith.divf %D, %E : vector<16x16xf16> vector.transfer_write %F, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> @@ -199,11 +199,11 @@ %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true, true]} : memref<16xf16>, vector<16x16xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true]} : memref<16xf16>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16xf16>, memref<2x16x16xf16> return } @@ -227,11 +227,11 @@ %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true, true]} : memref<16xf16>, vector<16x16xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true]} : memref<16xf16>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16xf16>, memref<2x16x16xf16> return } @@ -281,8 +281,8 @@ %cst_0 = 
arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0], %cst {in_bounds = [true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1716,7 +1716,8 @@ func.func @transfer_read_2d_to_1d(%A : memref<?x?xf32>, %base0: index, %base1: index) -> vector<17xf32> { %f7 = arith.constant 7.0: f32 %f = vector.transfer_read %A[%base0, %base1], %f7 - {permutation_map = affine_map<(d0, d1) -> (d1)>} : + {permutation_map = affine_map<(d0, d1) -> (d1)>, + in_bounds = [true, false]} : memref<?x?xf32>, vector<17xf32> return %f: vector<17xf32> } diff --git a/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir b/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir --- a/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir +++ b/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir @@ -4,7 +4,7 @@ // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<4x9xf32>> // CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<4x9xf32>> to memref<4xvector<9xf32>> // CHECK: scf.for {{.*}} { -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %cst {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %cst {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: memref.store %[[READ]], %[[CASTED]][%{{.*}}] : memref<4xvector<9xf32>> // CHECK: } // CHECK: %[[LOADED:.*]] = memref.load %[[ALLOC]][] : memref<vector<4x9xf32>> @@ -25,7 +25,7 @@ // CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<2x3xf32>> to memref<2xvector<3xf32>> // CHECK: %[[RESULT:.*]] = scf.for {{.*}} iter_args(%[[STATE:.*]] = %{{.*}}) -> (tensor<?x?xf32>) { // CHECK: %[[LOADED:.*]] = memref.load %[[CASTED]][%{{.*}}] : memref<2xvector<3xf32>> -// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[LOADED]], %[[STATE]][{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[LOADED]], %[[STATE]][{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: scf.yield %[[WRITE]] : tensor<?x?xf32> // CHECK: } // CHECK: return %[[RESULT]] : tensor<?x?xf32> diff --git a/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir b/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir --- a/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir +++ 
b/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir @@ -2,13 +2,13 @@ // CHECK-LABEL: func @transfer_read_2d( // CHECK: %[[V_INIT:.*]] = arith.constant dense<-4.200000e+01> : vector<4x9xf32> -// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I0:.*]] = vector.insert %[[V0]], %[[V_INIT]] [0] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I1:.*]] = vector.insert %[[V1]], %[[I0]] [1] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I2:.*]] = vector.insert %[[V2]], %[[I1]] [2] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I3:.*]] = vector.insert %[[V3]], %[[I2]] [3] : vector<9xf32> into vector<4x9xf32> // CHECK: return %[[I3]] : vector<4x9xf32> func.func @transfer_read_2d(%A : tensor<?x?xf32>, %base1 : index, %base2 : index) @@ -23,9 +23,9 @@ // CHECK-LABEL: func @transfer_write_2d( // CHECK: %[[V0:.*]] = vector.extract %{{.*}}[0] : vector<2x3xf32> -// CHECK: %[[T0:.*]] = vector.transfer_write %[[V0]], %{{.*}}[{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[T0:.*]] = vector.transfer_write %[[V0]], %{{.*}}[{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: %[[V1:.*]] = vector.extract %{{.*}}[1] : vector<2x3xf32> -// CHECK: %[[T1:.*]] = vector.transfer_write %[[V1]], %[[T0]][{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[T1:.*]] = vector.transfer_write %[[V1]], %[[T0]][{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: return %[[T1]] : tensor<?x?xf32> func.func @transfer_write_2d(%A : tensor<?x?xf32>, %vec : vector<2x3xf32>, %base1 : index, %base2 : index) -> (tensor<?x?xf32>) { diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir @@ -12,7 +12,7 @@ // CHECK: scf.for {{.*}} { // CHECK: scf.if {{.*}} { // CHECK: %[[MASK_LOADED:.*]] = memref.load %[[MASK_CASTED]][%{{.*}}] : memref<4xvector<9xi1>> -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}, %{{.*}}, %[[MASK_LOADED]] : memref<?x?xf32>, vector<9xf32> +// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}, %{{.*}}, %[[MASK_LOADED]] {in_bounds = [true, false]} : memref<?x?xf32>, vector<9xf32> // CHECK: memref.store %[[READ]], %{{.*}} : memref<4xvector<9xf32>> // CHECK: } // CHECK: } diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir @@ -26,13 +26,13 @@ 
%A = memref.alloc () : memref<7x42xf32> affine.for %i0 = 0 to 7 step 4 { affine.for %i1 = 0 to 42 step 4 { - %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f1 = vector.transfer_read %A[%i0, %i1], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1) - %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1) - %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1) - %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds. // CHECK: scf.if // CHECK-NEXT: memref.load @@ -59,9 +59,9 @@ affine.for %i2 = 0 to %dyn2 { affine.for %i3 = 0 to 42 step 2 { affine.for %i4 = 0 to %dyn4 { - %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> + %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {in_bounds = [true, true, true, false, true], permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3) - %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> + %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {in_bounds = [true, true, true, false, true], permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> // Add a dummy use to prevent dead code elimination from removing // transfer read ops. "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> () @@ -132,7 +132,7 @@ affine.for %i1 = 0 to %N { affine.for %i2 = 0 to %O { affine.for %i3 = 0 to %P step 5 { - %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32> + %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {in_bounds = [false, true, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32> // Add a dummy use to prevent dead code elimination from removing // transfer read ops. 
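The pattern behind these test updates, spelled out on the first read above: reads that previously carried no attribute now list one entry per memref dim, `true` for dims absent from the permutation map and `false` for transfer dims that may run out-of-bounds.

```mlir
%f1 = vector.transfer_read %A[%i0, %i1], %f0
    {in_bounds = [false, true],
     permutation_map = affine_map<(d0, d1) -> (d0)>}
    : memref<7x42xf32>, vector<4xf32>
```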
"dummy_use"(%f) : (vector<5x4x3xf32>) -> () @@ -173,7 +173,7 @@ // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] { // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]]) // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>> - // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref + // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] {in_bounds = [true, true, true, false]} : vector<5xf32>, memref // CHECK: } // CHECK: } // CHECK: } @@ -195,7 +195,7 @@ affine.for %i1 = 0 to %N step 4 { affine.for %i2 = 0 to %O { affine.for %i3 = 0 to %P step 5 { - vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref + vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {in_bounds = [false, false, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref } } } @@ -233,7 +233,7 @@ // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK: %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> // CHECK: } else { // CHECK: store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> @@ -247,7 +247,7 @@ // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -256,7 +256,7 @@ // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -265,7 +265,7 @@ // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -306,7 +306,7 @@ // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], 
%[[add]] : index // CHECK: scf.if %[[cmp]] { // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref + // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // CHECK: } // CHECK: } @@ -315,19 +315,19 @@ // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: scf.if %[[CMP0]] { // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index // FULL-UNROLL: scf.if %[[CMP1]] { // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index // FULL-UNROLL: scf.if %[[CMP2]] { // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } vector.transfer_write %vec, %A[%base, %base] : @@ -360,16 +360,16 @@ // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref vector.transfer_write %vec, %A[%base, %base] {in_bounds = 
[true, true]} : vector<3x15xf32>, memref return @@ -396,7 +396,8 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0 - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } + { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>, + in_bounds = [true, true, false, false] } : memref, vector<3x3xf32> return %0 : vector<3x3xf32> } @@ -415,7 +416,7 @@ // CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index // CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref, vector<3xf32> +// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] {in_bounds = [true, true, true, false]} : memref, vector<3xf32> // CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> // CHECK: } else { // CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> @@ -428,7 +429,8 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 vector.transfer_write %A, %B[%c0, %c0, %c0, %c0] - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } + { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>, + in_bounds = [true, true, false, false] } : vector<3x3xf32>, memref return } @@ -448,7 +450,7 @@ // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index // CHECK: scf.if %[[cmp]] { // CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>> -// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref +// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] {in_bounds = [true, true, true, false]} : vector<3xf32>, memref // CHECK: } // CHECK: } // CHECK: return @@ -459,7 +461,7 @@ func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> { %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - %0 = vector.transfer_read %A[%c0, %c0], %f0 + %0 = vector.transfer_read %A[%c0, %c0], %f0 {in_bounds = [true, false]} : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32> return %0 : vector<4xf32> } @@ -470,7 +472,7 @@ func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) { %c0 = arith.constant 0 : index - vector.transfer_write %A, %B[%c0, %c0] : + vector.transfer_write %A, %B[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>> return } diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir @@ -56,7 +56,7 @@ // VECNEST: vector.transfer_read // VECNEST-NEXT: affine.for %{{.*}} = 0 to 30 { // VECNEST: vector.transfer_read -// VECNEST-NEXT: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #{{.*}}} +// VECNEST-NEXT: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [false, true], permutation_map = #{{.*}}} // VECNEST-NEXT: } // VECNEST-NEXT: vector.transfer_write // VECNEST: } diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ 
b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -22,7 +22,7 @@ // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector %a0 = affine.load %A[%c0, %c0] : memref } @@ -48,7 +48,7 @@ // CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i3 = 0 to %M { // vectorized %a3 = affine.load %A[%c0, %i3] : memref } @@ -77,7 +77,7 @@ // CHECK-NEXT: %[[APP9_0:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) // CHECK-NEXT: %[[APP9_1:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i8 = 0 to %M { // vectorized affine.for %i9 = 0 to %N { %a9 = affine.load %A[%i9, %i8 + %i9] : memref @@ -98,7 +98,7 @@ affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[C1]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref // non-scoped %f1 affine.store %f1, %A[%i0, %i1] : memref } @@ -106,7 +106,7 @@ affine.for %i2 = 0 to %M { affine.for %i3 = 0 to %N { // CHECK: %[[C3:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[C3]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref // non-scoped %f2 affine.store %f2, %B[%i2, %i3] : memref } @@ -115,13 +115,13 @@ affine.for %i5 = 0 to %N { // CHECK: %[[SPLAT2:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32> // CHECK: %[[SPLAT1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> - // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> - // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> + // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> + // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> // CHECK: %[[S5:.*]] = arith.addf %[[A5]], %[[B5]] : vector<128xf32> // CHECK: %[[S6:.*]] = arith.addf %[[S5]], %[[SPLAT1]] : vector<128xf32> // CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[SPLAT2]] : vector<128xf32> // CHECK: %[[S8:.*]] = arith.addf %[[S7]], %[[S6]] : vector<128xf32> - // CHECK: 
vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[S8]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref %a5 = affine.load %A[%i4, %i5] : memref %b5 = affine.load %B[%i4, %i5] : memref %s5 = arith.addf %a5, %b5 : f32 @@ -150,7 +150,7 @@ affine.for %i0 = 0 to %M { // vectorized // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> // CHECK-NEXT: affine.for - // CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref + // CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref affine.for %i1 = 0 to %N { affine.store %f1, %A[%i1, %i0] : memref } @@ -171,7 +171,7 @@ // CHECK-NEXT: affine.for %[[IV1:[0-9a-zA-Z_]+]] = 0 to 32 { // CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[IV1]] : index to vector<128xindex> // CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[BROADCAST]] : vector<128xindex> to vector<128xi32> - // CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] : vector<128xi32>, memref<32x512xi32> + // CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] {in_bounds = [true, false]} : vector<128xi32>, memref<32x512xi32> affine.for %i = 0 to 512 { // vectorized affine.for %j = 0 to 32 { %idx = arith.index_cast %j : index to i32 @@ -281,7 +281,7 @@ // CHECK:for [[IV4:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128 { // CHECK-NEXT: for [[IV5:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] { // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i4 = 0 to %M { // vectorized affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1 %a5 = affine.load %A[%i5, %i4] : memref @@ -425,7 +425,7 @@ // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref @@ -459,7 +459,7 @@ // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir 
b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -123,8 +123,8 @@ // VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 { // VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) { - // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> - // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> + // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {in_bounds = [true, false], permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> + // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {in_bounds = [false, true], permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = arith.mulf %[[B]], %[[A]] : vector<4x8xf32> // VECT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref, vector<4x8xf32> // VECT-NEXT: %[[E:.*]] = arith.addf %[[D]], %[[C]] : vector<4x8xf32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir @@ -13,7 +13,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { affine.for %i2 = 0 to %P { diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir @@ -25,7 +25,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> affine.for %i3 = 0 to %M { affine.for %i4 = 0 to %N { affine.for %i5 = 0 to %P { @@ -46,12 +46,12 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : 
memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %0 { affine.for %i1 = 0 to %1 { affine.for %i2 = 0 to %2 { diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir @@ -25,7 +25,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> affine.for %i3 = 0 to %M { affine.for %i4 = 0 to %N { affine.for %i5 = 0 to %P { @@ -46,12 +46,12 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %0 { affine.for %i1 = 0 to %1 { affine.for %i2 = 0 to 
%2 { diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir --- a/mlir/test/Dialect/Linalg/hoisting.mlir +++ b/mlir/test/Dialect/Linalg/hoisting.mlir @@ -46,13 +46,13 @@ // CHECK: "unrelated_use"(%[[MEMREF1]]) : (memref) -> () scf.for %i = %lb to %ub step %step { scf.for %j = %lb to %ub step %step { - %r0 = vector.transfer_read %memref1[%c0, %c0], %cst: memref, vector<1xf32> - %r1 = vector.transfer_read %memref0[%i, %i], %cst: memref, vector<2xf32> - %r2 = vector.transfer_read %memref2[%c0, %c0], %cst: memref, vector<3xf32> - %r3 = vector.transfer_read %memref3[%c0, %c0], %cst: memref, vector<4xf32> + %r0 = vector.transfer_read %memref1[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<1xf32> + %r1 = vector.transfer_read %memref0[%i, %i], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r2 = vector.transfer_read %memref2[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r3 = vector.transfer_read %memref3[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<4xf32> "some_crippling_use"(%memref4) : (memref) -> () - %r4 = vector.transfer_read %memref4[%c0, %c0], %cst: memref, vector<5xf32> - %r5 = vector.transfer_read %memref5[%c0, %c0], %cst: memref, vector<6xf32> + %r4 = vector.transfer_read %memref4[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<5xf32> + %r5 = vector.transfer_read %memref5[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<6xf32> "some_crippling_use"(%memref5) : (memref) -> () %u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32> %u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32> @@ -60,12 +60,12 @@ %u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32> %u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32> %u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32> - vector.transfer_write %u0, %memref1[%c0, %c0] : vector<1xf32>, memref - vector.transfer_write %u1, %memref0[%i, %i] : vector<2xf32>, memref - vector.transfer_write %u2, %memref2[%c0, %c0] : vector<3xf32>, memref - vector.transfer_write %u3, %memref3[%c0, %c0] : vector<4xf32>, memref - vector.transfer_write %u4, %memref4[%c0, %c0] : vector<5xf32>, memref - vector.transfer_write %u5, %memref5[%c0, %c0] : vector<6xf32>, memref + vector.transfer_write %u0, %memref1[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, memref + vector.transfer_write %u1, %memref0[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u2, %memref2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u3, %memref3[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u4, %memref4[%c0, %c0] {in_bounds = [true, false]} : vector<5xf32>, memref + vector.transfer_write %u5, %memref5[%c0, %c0] {in_bounds = [true, false]} : vector<6xf32>, memref "some_crippling_use"(%memref3) : (memref) -> () } "unrelated_use"(%memref0) : (memref) -> () @@ -134,14 +134,14 @@ // CHECK: vector.transfer_write %{{.*}}, %[[MEMREF2]]{{.*}} : vector<3xf32>, memref scf.for %i = %lb to %ub step %step { scf.for %j = %lb to %ub step %step { - %r00 = vector.transfer_read %memref1[%c0, %c0], %cst: memref, vector<2xf32> - %r01 = vector.transfer_read %memref1[%c0, %c1], %cst: memref, vector<2xf32> - %r20 = vector.transfer_read %memref2[%c0, %c0], %cst: memref, vector<3xf32> - %r21 = vector.transfer_read %memref2[%c0, %c3], %cst: memref, vector<3xf32> - %r30 = vector.transfer_read %memref3[%c0, %random_index], %cst: memref, vector<4xf32> - %r31 = 
vector.transfer_read %memref3[%c1, %random_index], %cst: memref, vector<4xf32> - %r10 = vector.transfer_read %memref0[%i, %i], %cst: memref, vector<2xf32> - %r11 = vector.transfer_read %memref0[%random_index, %random_index], %cst: memref, vector<2xf32> + %r00 = vector.transfer_read %memref1[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r01 = vector.transfer_read %memref1[%c0, %c1], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r20 = vector.transfer_read %memref2[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r21 = vector.transfer_read %memref2[%c0, %c3], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r30 = vector.transfer_read %memref3[%c0, %random_index], %cst {in_bounds = [true, false]} : memref, vector<4xf32> + %r31 = vector.transfer_read %memref3[%c1, %random_index], %cst {in_bounds = [true, false]} : memref, vector<4xf32> + %r10 = vector.transfer_read %memref0[%i, %i], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r11 = vector.transfer_read %memref0[%random_index, %random_index], %cst {in_bounds = [true, false]} : memref, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> %u20 = "some_use"(%r20) : (vector<3xf32>) -> vector<3xf32> @@ -150,14 +150,14 @@ %u31 = "some_use"(%r31) : (vector<4xf32>) -> vector<4xf32> %u10 = "some_use"(%r10) : (vector<2xf32>) -> vector<2xf32> %u11 = "some_use"(%r11) : (vector<2xf32>) -> vector<2xf32> - vector.transfer_write %u00, %memref1[%c0, %c0] : vector<2xf32>, memref - vector.transfer_write %u01, %memref1[%c0, %c1] : vector<2xf32>, memref - vector.transfer_write %u20, %memref2[%c0, %c0] : vector<3xf32>, memref - vector.transfer_write %u21, %memref2[%c0, %c3] : vector<3xf32>, memref - vector.transfer_write %u30, %memref3[%c0, %random_index] : vector<4xf32>, memref - vector.transfer_write %u31, %memref3[%c1, %random_index] : vector<4xf32>, memref - vector.transfer_write %u10, %memref0[%i, %i] : vector<2xf32>, memref - vector.transfer_write %u11, %memref0[%random_index, %random_index] : vector<2xf32>, memref + vector.transfer_write %u00, %memref1[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u01, %memref1[%c0, %c1] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u20, %memref2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u21, %memref2[%c0, %c3] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u30, %memref3[%c0, %random_index] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u31, %memref3[%c1, %random_index] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u10, %memref0[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u11, %memref0[%random_index, %random_index] {in_bounds = [true, false]} : vector<2xf32>, memref } } return @@ -180,7 +180,7 @@ // CHECK: %[[C0:.*]] = arith.constant 0 : i32 // CHECK: affine.for %[[I:.*]] = 0 to 64 { // CHECK: affine.for %[[J:.*]] = 0 to 64 step 16 { -// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] : memref<64x64xi32>, vector<16xi32> +// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] {{.*}} : memref<64x64xi32>, vector<16xi32> // CHECK: %[[R:.*]] = affine.for %[[K:.*]] = 0 to 64 iter_args(%[[ACC:.*]] = %[[R0]]) -> (vector<16xi32>) { // CHECK: %[[AV:.*]] = 
vector.transfer_read %[[MEMREF0]][%[[I]], %[[K]]], %[[C0]] {{.*}}: memref<64x64xi32>, vector<16xi32> // CHECK: %[[BV:.*]] = vector.transfer_read %[[MEMREF1]][%[[K]], %[[J]]], %[[C0]] {{.*}}: memref<64x64xi32>, vector<16xi32> @@ -188,7 +188,7 @@ // CHECK: %[[T1:.*]] = arith.addi %[[ACC]], %[[T0]] : vector<16xi32> // CHECK: affine.yield %[[T1]] : vector<16xi32> // CHECK: } -// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] : vector<16xi32>, memref<64x64xi32> +// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] {{.*}} : vector<16xi32>, memref<64x64xi32> // CHECK: } // CHECK: } func.func @hoist_vector_transfer_pairs_in_affine_loops(%memref0: memref<64x64xi32>, %memref1: memref<64x64xi32>, %memref2: memref<64x64xi32>) { @@ -196,12 +196,12 @@ affine.for %arg3 = 0 to 64 { affine.for %arg4 = 0 to 64 step 16 { affine.for %arg5 = 0 to 64 { - %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32> - %1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> - %2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> + %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>, in_bounds = [true, true]} : memref<64x64xi32>, vector<16xi32> + %1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 {in_bounds = [true, false]} : memref<64x64xi32>, vector<16xi32> + %2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 {in_bounds = [true, false]} : memref<64x64xi32>, vector<16xi32> %3 = arith.muli %0, %1 : vector<16xi32> %4 = arith.addi %2, %3 : vector<16xi32> - vector.transfer_write %4, %memref2[%arg3, %arg4] : vector<16xi32>, memref<64x64xi32> + vector.transfer_write %4, %memref2[%arg3, %arg4] {in_bounds = [true, false]} : vector<16xi32>, memref<64x64xi32> } } } @@ -264,12 +264,12 @@ %arg9 = %arg3, %arg10 = %arg4, %arg11 = %arg5) -> (tensor, tensor, tensor, tensor, tensor, tensor) { - %r0 = vector.transfer_read %arg7[%c0, %c0], %cst: tensor, vector<1xf32> - %r1 = vector.transfer_read %arg6[%i, %i], %cst: tensor, vector<2xf32> - %r3 = vector.transfer_read %arg9[%c0, %c0], %cst: tensor, vector<4xf32> + %r0 = vector.transfer_read %arg7[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> + %r1 = vector.transfer_read %arg6[%i, %i], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r3 = vector.transfer_read %arg9[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> "some_crippling_use"(%arg10) : (tensor) -> () - %r4 = vector.transfer_read %arg10[%c0, %c0], %cst: tensor, vector<5xf32> - %r5 = vector.transfer_read %arg11[%c0, %c0], %cst: tensor, vector<6xf32> + %r4 = vector.transfer_read %arg10[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<5xf32> + %r5 = vector.transfer_read %arg11[%c0, %c0], %cst{in_bounds = [true, false]} : tensor, vector<6xf32> "some_crippling_use"(%arg11) : (tensor) -> () %u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32> %u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32> @@ -277,12 +277,12 @@ %u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32> %u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32> %u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32> - %w1 = vector.transfer_write %u0, %arg7[%c0, %c0] : vector<1xf32>, tensor - %w0 = vector.transfer_write %u1, %arg6[%i, %i] : vector<2xf32>, tensor - %w2 = vector.transfer_write %u2, %arg8[%c0, %c0] : vector<3xf32>, tensor - %w3 
= vector.transfer_write %u3, %arg9[%c0, %c0] : vector<4xf32>, tensor - %w4 = vector.transfer_write %u4, %arg10[%c0, %c0] : vector<5xf32>, tensor - %w5 = vector.transfer_write %u5, %arg11[%c0, %c0] : vector<6xf32>, tensor + %w1 = vector.transfer_write %u0, %arg7[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor + %w0 = vector.transfer_write %u1, %arg6[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w2 = vector.transfer_write %u2, %arg8[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w3 = vector.transfer_write %u3, %arg9[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w4 = vector.transfer_write %u4, %arg10[%c0, %c0] {in_bounds = [true, false]} : vector<5xf32>, tensor + %w5 = vector.transfer_write %u5, %arg11[%c0, %c0] {in_bounds = [true, false]} : vector<6xf32>, tensor "some_crippling_use"(%w3) : (tensor) -> () scf.yield %w0, %w1, %w2, %w3, %w4, %w5 : tensor, tensor, tensor, tensor, @@ -361,14 +361,14 @@ iter_args(%arg4 = %arg0, %arg5 = %arg1, %arg6 = %arg2, %arg7 = %arg3) -> (tensor, tensor, tensor, tensor) { - %r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor, vector<2xf32> - %r01 = vector.transfer_read %arg5[%c0, %c1], %cst: tensor, vector<2xf32> - %r20 = vector.transfer_read %arg6[%c0, %c0], %cst: tensor, vector<3xf32> - %r21 = vector.transfer_read %arg6[%c0, %c3], %cst: tensor, vector<3xf32> - %r30 = vector.transfer_read %arg7[%c0, %random_index], %cst: tensor, vector<4xf32> - %r31 = vector.transfer_read %arg7[%c1, %random_index], %cst: tensor, vector<4xf32> - %r10 = vector.transfer_read %arg4[%i, %i], %cst: tensor, vector<2xf32> - %r11 = vector.transfer_read %arg4[%random_index, %random_index], %cst: tensor, vector<2xf32> + %r00 = vector.transfer_read %arg5[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r01 = vector.transfer_read %arg5[%c0, %c1], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r20 = vector.transfer_read %arg6[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> + %r21 = vector.transfer_read %arg6[%c0, %c3], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> + %r30 = vector.transfer_read %arg7[%c0, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> + %r31 = vector.transfer_read %arg7[%c1, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> + %r10 = vector.transfer_read %arg4[%i, %i], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r11 = vector.transfer_read %arg4[%random_index, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> %u20 = "some_use"(%r20) : (vector<3xf32>) -> vector<3xf32> @@ -377,14 +377,14 @@ %u31 = "some_use"(%r31) : (vector<4xf32>) -> vector<4xf32> %u10 = "some_use"(%r10) : (vector<2xf32>) -> vector<2xf32> %u11 = "some_use"(%r11) : (vector<2xf32>) -> vector<2xf32> - %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor - %w11 = vector.transfer_write %u01, %w10[%c0, %c1] : vector<2xf32>, tensor - %w20 = vector.transfer_write %u20, %arg6[%c0, %c0] : vector<3xf32>, tensor - %w21 = vector.transfer_write %u21, %w20[%c0, %c3] : vector<3xf32>, tensor - %w30 = vector.transfer_write %u30, %arg7[%c0, %random_index] : vector<4xf32>, tensor - %w31 = vector.transfer_write %u31, %w30[%c1, %random_index] : vector<4xf32>, tensor - %w00 = vector.transfer_write %u10, %arg4[%i, %i] : vector<2xf32>, tensor - %w01 = vector.transfer_write 
%u11, %w00[%random_index, %random_index] : vector<2xf32>, tensor + %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w11 = vector.transfer_write %u01, %w10[%c0, %c1] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w20 = vector.transfer_write %u20, %arg6[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w21 = vector.transfer_write %u21, %w20[%c0, %c3] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w30 = vector.transfer_write %u30, %arg7[%c0, %random_index] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w31 = vector.transfer_write %u31, %w30[%c1, %random_index] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w00 = vector.transfer_write %u10, %arg4[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w01 = vector.transfer_write %u11, %w00[%random_index, %random_index] {in_bounds = [true, false]} : vector<2xf32>, tensor scf.yield %w01, %w11, %w21, %w31 : tensor, tensor, tensor, tensor } scf.yield %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor @@ -444,19 +444,19 @@ -> (tensor, tensor, tensor) { // Hoists. %st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor to tensor - %r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor, vector<1xf32> + %r0 = vector.transfer_read %st0[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> // CHECK: %[[ST1:.*]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor to tensor // CHECK: %[[V1:.*]] = vector.transfer_read %[[ST1]]{{.*}} : tensor, vector<2xf32> // Does not hoist (slice depends on %j) %st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor to tensor - %r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor, vector<2xf32> + %r1 = vector.transfer_read %st1[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> // CHECK: %[[ST2:.*]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor to tensor // CHECK: %[[V2:.*]] = vector.transfer_read %[[ST2]]{{.*}} : tensor, vector<3xf32> // Does not hoist, 2 slice %arg8. %st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor to tensor - %r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor, vector<3xf32> + %r2 = vector.transfer_read %st2[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> // CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32> // CHECK: %[[U1:.*]] = "some_use"(%[[V1]]) : (vector<2xf32>) -> vector<2xf32> @@ -466,15 +466,15 @@ %u2 = "some_use"(%r2) : (vector<3xf32>) -> vector<3xf32> // Hoists - %w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor + %w0 = vector.transfer_write %u0, %st0[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor // CHECK-DAG: %[[STI1:.*]] = vector.transfer_write %[[U1]], %{{.*}} : vector<2xf32>, tensor // Does not hoist (associated slice depends on %j). - %w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor + %w1 = vector.transfer_write %u1, %st1[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor // CHECK-DAG: %[[STI2:.*]] = vector.transfer_write %[[U2]], %{{.*}} : vector<3xf32>, tensor // Does not hoist, 2 slice / insert_slice for %arg8. - %w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor + %w2 = vector.transfer_write %u2, %st2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor // Hoists. 
%sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor into tensor @@ -530,8 +530,8 @@ // CHECK: %[[R5:.*]] = "some_use"(%[[R3]]) : (vector<2xf32>) -> vector<2xf32> // CHECK: scf.yield %[[R5]], %[[R4]] : vector<2xf32>, vector<2xf32> // CHECK: } -// CHECK: %[[W0:.*]] = vector.transfer_write %[[F]]#1, %[[T]][%[[C0]], %[[C0]]] : vector<2xf32>, tensor -// CHECK: %[[W1:.*]] = vector.transfer_write %[[F]]#0, %[[W0]][%[[C0]], %[[C3]]] : vector<2xf32>, tensor +// CHECK: %[[W0:.*]] = vector.transfer_write %[[F]]#1, %[[T]][%[[C0]], %[[C0]]] {{.*}} : vector<2xf32>, tensor +// CHECK: %[[W1:.*]] = vector.transfer_write %[[F]]#0, %[[W0]][%[[C0]], %[[C3]]] {{.*}} : vector<2xf32>, tensor // CHECK: return %[[W1]] : tensor func.func @hoist_vector_transfer_write_pairs_disjoint_tensor( %tensor: tensor, @@ -543,14 +543,14 @@ %cst = arith.constant 0.0 : f32 %1 = scf.for %j = %lb to %ub step %step iter_args(%arg5 = %tensor) -> (tensor) { - %r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor, vector<2xf32> + %r00 = vector.transfer_read %arg5[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> - %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor + %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, tensor // Hoist by properly bypassing the disjoint write %w10. - %r01 = vector.transfer_read %w10[%c0, %c3], %cst: tensor, vector<2xf32> + %r01 = vector.transfer_read %w10[%c0, %c3], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> - %w11 = vector.transfer_write %u01, %w10[%c0, %c3] : vector<2xf32>, tensor + %w11 = vector.transfer_write %u01, %w10[%c0, %c3] {in_bounds = [true, false]} : vector<2xf32>, tensor scf.yield %w11 : tensor } return %1 : tensor @@ -604,19 +604,19 @@ -> (tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>) { // Hoists. %st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor<100x100xf32> to tensor - %r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor, vector<1xf32> + %r0 = vector.transfer_read %st0[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> // CHECK: %[[ST1:.*]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<200x200xf32> to tensor // CHECK: %[[V1:.*]] = vector.transfer_read %[[ST1]]{{.*}} : tensor, vector<2xf32> // Does not hoist (slice depends on %j) %st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor<200x200xf32> to tensor - %r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor, vector<2xf32> + %r1 = vector.transfer_read %st1[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> // CHECK: %[[ST2:.*]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<300x300xf32> to tensor // CHECK: %[[V2:.*]] = vector.transfer_read %[[ST2]]{{.*}} : tensor, vector<3xf32> // Does not hoist, 2 slice %arg8. 
%st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor<300x300xf32> to tensor - %r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor, vector<3xf32> + %r2 = vector.transfer_read %st2[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> // CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32> // CHECK: %[[U1:.*]] = "some_use"(%[[V1]]) : (vector<2xf32>) -> vector<2xf32> @@ -626,15 +626,15 @@ %u2 = "some_use"(%r2) : (vector<3xf32>) -> vector<3xf32> // Hoists - %w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor + %w0 = vector.transfer_write %u0, %st0[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor // CHECK-DAG: %[[STI1:.*]] = vector.transfer_write %[[U1]], %{{.*}} : vector<2xf32>, tensor // Does not hoist (associated slice depends on %j). - %w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor + %w1 = vector.transfer_write %u1, %st1[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor // CHECK-DAG: %[[STI2:.*]] = vector.transfer_write %[[U2]], %{{.*}} : vector<3xf32>, tensor // Does not hoist, 2 slice / insert_slice for %arg8. - %w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor + %w2 = vector.transfer_write %u2, %st2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor // Hoists. %sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor into tensor<100x100xf32> diff --git a/mlir/test/Dialect/Linalg/vectorization-masked.mlir b/mlir/test/Dialect/Linalg/vectorization-masked.mlir --- a/mlir/test/Dialect/Linalg/vectorization-masked.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-masked.mlir @@ -53,7 +53,7 @@ // CHECK-LABEL: @vectorize_dynamic_1d_broadcast // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {permutation_map = #{{.*}}} : tensor, vector<4xf32> +// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {in_bounds = [true], permutation_map = #{{.*}}} : tensor, vector<4xf32> // CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1> // CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> // CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> @@ -464,10 +464,10 @@ // CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_11:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_8]] : vector<8x4xi1> -// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_10]] {in_bounds = [true, true, true], permutation_map = #map} : memref, vector<8x16x4xf32> } : vector<8x4xi1> -> vector<8x16x4xf32> +// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_10]] {in_bounds = [true, true], permutation_map = #map} : memref, vector<8x16x4xf32> } : vector<8x4xi1> -> vector<8x16x4xf32> // CHECK: %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_14:.*]] = vector.create_mask %[[VAL_8]], %[[VAL_6]] : vector<4x16xi1> -// CHECK: %[[VAL_15:.*]] = vector.mask %[[VAL_14]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_13]] {in_bounds = [true, true, true], permutation_map = 
#map1} : memref, vector<8x16x4xf32> } : vector<4x16xi1> -> vector<8x16x4xf32> +// CHECK: %[[VAL_15:.*]] = vector.mask %[[VAL_14]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_13]] {in_bounds = [true, true], permutation_map = #map1} : memref, vector<8x16x4xf32> } : vector<4x16xi1> -> vector<8x16x4xf32> // CHECK: %[[VAL_16:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_17:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<8x16xi1> // CHECK: %[[VAL_18:.*]] = vector.mask %[[VAL_17]] { vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_16]] {in_bounds = [true, true]} : memref, vector<8x16xf32> } : vector<8x16xi1> -> vector<8x16xf32> diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -663,10 +663,10 @@ // CHECK: func @generic_vectorize_broadcast_transpose // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CF:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP0]]} : memref<4x4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP1]]} : memref<4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP2]]} : memref<4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP3]]} : memref<4x4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true], permutation_map = #[[$MAP0]]} : memref<4x4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true], permutation_map = #[[$MAP1]]} : memref<4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true], permutation_map = #[[$MAP2]]} : memref<4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true], permutation_map = #[[$MAP3]]} : memref<4x4xf32>, vector<4x4x4x4xf32> // CHECK: %[[SUB:.*]] = arith.subf %[[V0]], %[[V1]] : vector<4x4x4x4xf32> // CHECK: %[[ADD0:.*]] = arith.addf %[[V2]], %[[SUB]] : vector<4x4x4x4xf32> // CHECK: %[[ADD1:.*]] = arith.addf %[[V3]], %[[ADD0]] : vector<4x4x4x4xf32> @@ -715,8 +715,8 @@ // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (0, d1, 0, d0)> // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)> // CHECK: func @vectorization_transpose -// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true, true, true], permutation_map = #[[MAP0]]} : memref<14x7xf32>, vector<7x14x8x16xf32> -// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true, true, true], permutation_map = #[[MAP1]]} : memref<16x14xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true], permutation_map = #[[MAP0]]} : memref<14x7xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true], permutation_map = #[[MAP1]]} : memref<16x14xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}}{in_bounds = [true, 
true, true, true], permutation_map = #[[MAP2]]} : memref<16x14x7x8xf32>, vector<7x14x8x16xf32> // CHECK: arith.addf {{.*}} : vector<7x14x8x16xf32> // CHECK: arith.addf {{.*}} : vector<7x14x8x16xf32> @@ -1132,8 +1132,8 @@ func.func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: tensor<5x2xf32>) -> tensor<5x2xf32> { - // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M1]]} : tensor<3x2xf32>, vector<2x3x4x5xf32> - // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M2]]} : tensor<5x4xf32>, vector<2x3x4x5xf32> + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M1]]} : tensor<3x2xf32>, vector<2x3x4x5xf32> + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M2]]} : tensor<5x4xf32>, vector<2x3x4x5xf32> // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M3]]} : tensor<5x2xf32>, vector<2x5xf32> // CHECK: math.exp {{.*}} : vector<2x3x4x5xf32> // CHECK: math.exp {{.*}} : vector<2x3x4x5xf32> diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -149,7 +149,7 @@ // CHECK: %[[VAL_17:.*]] = vector.shape_cast %[[VAL_12]] : vector<1x4xindex> to vector<4xindex> // CHECK: %[[VAL_18:.*]] = vector.extractelement %[[VAL_17]]{{\[}}%[[VAL_7]] : i32] : vector<4xindex> // CHECK: %[[VAL_19:.*]] = vector.extractelement %[[VAL_16]]{{\[}}%[[VAL_7]] : i32] : vector<4xindex> -// CHECK: %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_18]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true]} : tensor<45x80x16xf32>, vector<1x4xf32> +// CHECK: %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_18]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true, true]} : tensor<45x80x16xf32>, vector<1x4xf32> // CHECK: %[[VAL_21:.*]] = vector.transfer_write %[[VAL_20]], %[[VAL_5]]{{\[}}%[[VAL_9]], %[[VAL_9]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32> // CHECK: return %[[VAL_21]] : tensor<1x4xf32> // CHECK: } diff --git a/mlir/test/Dialect/MemRef/extract-address-computations.mlir b/mlir/test/Dialect/MemRef/extract-address-computations.mlir --- a/mlir/test/Dialect/MemRef/extract-address-computations.mlir +++ b/mlir/test/Dialect/MemRef/extract-address-computations.mlir @@ -267,13 +267,13 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CF0:.*]] = arith.constant 0.0{{0*e\+00}} : f16 // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref to memref> -// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]], %[[CF0]] {permutation_map = #[[$PERMUTATION_MAP]]} : memref>, vector<4x2xf16> +// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]], %[[CF0]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : memref>, vector<4x2xf16> // CHECK: return %[[LOADED_VAL]] : vector<4x2xf16> func.func @test_transfer_read_op(%base : memref, %offset0 : index, %offset1: index, %offset2: index) -> vector<4x2xf16> { %cf0 = arith.constant 0.0 : f16 - %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { 
permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : memref, vector<4x2xf16> + %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : memref, vector<4x2xf16> return %loaded_val : vector<4x2xf16> } @@ -297,13 +297,13 @@ // CHECK-SAME: %[[DYN_OFFSET1:[^:]*]]: index, // CHECK-SAME: %[[DYN_OFFSET2:[^:]*]]: index) // CHECK: %[[CF0:.*]] = arith.constant 0.0{{0*e\+00}} : f16 -// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]], %[[CF0]] {permutation_map = #[[$PERMUTATION_MAP]]} : tensor, vector<4x2xf16> +// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]], %[[CF0]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : tensor, vector<4x2xf16> // CHECK: return %[[LOADED_VAL]] : vector<4x2xf16> func.func @test_transfer_read_op_with_tensor(%base : tensor, %offset0 : index, %offset1: index, %offset2: index) -> vector<4x2xf16> { %cf0 = arith.constant 0.0 : f16 - %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : tensor, vector<4x2xf16> + %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : tensor, vector<4x2xf16> return %loaded_val : vector<4x2xf16> } @@ -334,12 +334,12 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref to memref> -// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> +// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> // CHECK: return func.func @test_transfer_write_op(%base : memref, %offset0 : index, %offset1: index, %offset2: index) { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref + vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref return } @@ -371,12 +371,12 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref> to memref> -// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> +// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> // CHECK: return func.func @test_transfer_write_op_with_strides(%base : memref>, %offset0 : index, %offset1: index, %offset2: 
index) { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref> + vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref> return } @@ -400,12 +400,12 @@ // CHECK-SAME: %[[DYN_OFFSET1:[^:]*]]: index, // CHECK-SAME: %[[DYN_OFFSET2:[^:]*]]: index) // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> -// CHECK: %[[RES:.*]] = vector.transfer_write %[[VCF0]], %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, tensor +// CHECK: %[[RES:.*]] = vector.transfer_write %[[VCF0]], %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, tensor // CHECK: return %[[RES]] : tensor func.func @test_transfer_write_op_with_tensor(%base : tensor, %offset0 : index, %offset1: index, %offset2: index) -> tensor { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - %res = vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, tensor + %res = vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, tensor return %res : tensor } diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -102,7 +102,7 @@ %f1 = arith.constant 1.0 : f32 %0 = memref.subview %arg0[%arg1, %arg2][4, 4][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, strided<[?, ?], offset: ?>> - %1 = vector.transfer_read %0[%arg3, %arg4], %f1 {in_bounds = [true]} : memref<4x4xf32, strided<[?, ?], offset: ?>>, vector<4xf32> + %1 = vector.transfer_read %0[%arg3, %arg4], %f1 {in_bounds = [true, true]} : memref<4x4xf32, strided<[?, ?], offset: ?>>, vector<4xf32> return %1 : vector<4xf32> } // CHECK: func @fold_subview_with_transfer_read @@ -132,7 +132,7 @@ func.func @fold_static_stride_subview_with_transfer_write(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5: index, %arg6 : index, %arg7 : vector<4xf32>) { %0 = memref.subview %arg0[%arg1, %arg2][4, 4][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, strided<[?, ?], offset: ?>> - vector.transfer_write %arg7, %0[%arg3, %arg4] {in_bounds = [true]} : vector<4xf32>, memref<4x4xf32, strided<[?, ?], offset: ?>> + vector.transfer_write %arg7, %0[%arg3, %arg4] {in_bounds = [true, true]} : vector<4xf32>, memref<4x4xf32, strided<[?, ?], offset: ?>> return } // CHECK: func @fold_static_stride_subview_with_transfer_write @@ -186,7 +186,7 @@ %0 = memref.subview %arg0[0, %arg1, %arg2] [1, %arg3, %arg4] [1, 1, 1] : memref> to memref> - %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true]} + %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true, true]} : memref>, vector<4xf32> return %1 : vector<4xf32> } @@ -214,7 +214,7 @@ %0 = memref.subview %arg0[0, %arg2, %arg3] [1, %arg4, %arg5] [1, 1, 1] : memref> to memref> - vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true]} 
+ vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true, true]} : vector<4xf32>, memref> return } @@ -231,7 +231,7 @@ // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] -// CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]] {in_bounds = [true]} : vector<4xf32>, memref, memref> to memref> - vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true]} + vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true, true]} : vector<4xf32>, memref> return } @@ -262,7 +262,7 @@ // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] // CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[IDX0]], %[[IDX1]], %[[C0]]] -// CHECK-SAME: {in_bounds = [true], permutation_map = #[[MAP2]]} : vector<4xf32>, memref, memref, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[add:.*]] = affine.apply #[[$map]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_1d(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.0 : f32 %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<6xf32> return %1 : vector<6xf32> } @@ -46,7 +46,7 @@ // CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index // CHECK: %[[add:.*]] = affine.apply #[[$map1]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -61,7 +61,7 @@ // CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[s1]], %[[c10]]], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map2]]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[s1]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true], permutation_map = #[[$map2]]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_non_leading_rank_reduction(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -88,7 +88,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: 
%[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true], permutation_map = #[[$map2]]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true], permutation_map = #[[$map2]]} : vector<5x6xf32>, tensor func.func @insert_slice_of_transfer_write_non_leading_rank_reduction(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> @@ -100,7 +100,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir --- a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir +++ b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir @@ -8,7 +8,7 @@ %0 = tensor.extract_slice %arg0[0, %arg1, %arg2] [1, %arg3, %arg4] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?xf32> - %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true]} + %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4xf32> return %1 : vector<4xf32> } @@ -40,7 +40,7 @@ // Can't fold this atm since we don't emit the proper vector.extract_strided_slice.
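// (Illustration of the new convention, using assumed placeholder names: the
// in_bounds array now has one entry per source dimension rather than per
// vector dimension, so a 1-D read from a rank-3 slice carries three entries:
//   %v = vector.transfer_read %slice[%c1, %i, %c2], %f0 {in_bounds = [true, true, true]}
//       : tensor<1x4x4xf32>, vector<4xf32>
// Here %slice, %i, %c1, %c2 and %f0 are hypothetical values, not part of the test.)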
// CHECK: tensor.extract_slice %0 = tensor.extract_slice %src[0, %i1, %i2, %i3] [1, 4, 1, 4] [2, 3, 4, 5] : tensor<1x8x8x8xf32> to tensor<1x4x4xf32> - %1 = vector.transfer_read %0[%c1, %i4, %c2], %f0 {in_bounds = [true]} : tensor<1x4x4xf32>, vector<4xf32> + %1 = vector.transfer_read %0[%c1, %i4, %c2], %f0 {in_bounds = [true, true, true]} : tensor<1x4x4xf32>, vector<4xf32> return %1 : vector<4xf32> } @@ -87,14 +87,14 @@ // CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[add:.*]] = affine.apply #[[$ADD_4]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.0 : f32 %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<6xf32> return %1 : vector<6xf32> } @@ -107,7 +107,7 @@ // CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index // CHECK: %[[add:.*]] = affine.apply #[[$ADD_3]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -166,8 +166,8 @@ // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] -// CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]] {in_bounds = [true]} : vector<4xf32>, tensor, tensor, tensor %1 = tensor.insert_slice %0 into %arg0[0, %arg2, %arg3] [1, %arg4, %arg5] [1, 1, 1] : tensor into tensor @@ -200,8 +200,8 @@ // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] // CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[IDX0]], %[[IDX1]], %[[C0]]] - // CHECK-SAME: {in_bounds = [true], permutation_map = #[[MAP2]]} : vector<4xf32>, tensor, tensor, tensor %1 = tensor.insert_slice %0 into %arg0[%arg2, %arg3, 0] [%arg4, %arg5, 1] [1, 1, 1] : tensor into tensor @@ -239,7 +239,7 @@ // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index // CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] -// CHECK-SAME: {in_bounds = [true, true], permutation_map = #[[$d0d2]]} : vector<5x6xf32>, tensor +// CHECK-SAME: {in_bounds = [true, true, true], permutation_map = #[[$d0d2]]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> %1 = 
tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor @@ -252,7 +252,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -727,12 +727,16 @@ // ----- // CHECK-LABEL: fold_vector_transfers -func.func @fold_vector_transfers(%A: memref) -> (vector<4x8xf32>, vector<4x9xf32>) { +func.func @fold_vector_transfers(%A: memref, + %B: memref<9x8xf32>, + %idx: index) + -> (vector<4x8xf32>, vector<4x9xf32>, vector<1x2xf32>, vector<5xf32>) +{ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - // CHECK: vector.transfer_read %{{.*}} {in_bounds = [false, true]} - %1 = vector.transfer_read %A[%c0, %c0], %f0 : memref, vector<4x8xf32> + // CHECK: vector.transfer_read %{{.*}} {in_bounds = [true, true]} + %1 = vector.transfer_read %A[%c0, %c0], %f0 {in_bounds = [true, true]} : memref, vector<4x8xf32> // CHECK: vector.transfer_write %{{.*}} {in_bounds = [false, true]} vector.transfer_write %1, %A[%c0, %c0] : vector<4x8xf32>, memref @@ -747,8 +751,17 @@ // CHECK-NOT: in_bounds vector.transfer_write %2, %A[%c0, %c0] : vector<4x9xf32>, memref + // Cannot infer in_bounds for non-constant offsets or dynamic dim sizes. 
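+ // (Sketch of the assumed inference rule, for illustration only: a source dim
+ // can be folded to in-bounds when its offset is a known constant and the
+ // accessed extent fits the static dim size. For the vector<5xf32> read from
+ // memref<9x8xf32> below: 0 + 1 <= 9 and 0 + 5 <= 8, so the read folds to
+ // in_bounds = [true, true].)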
+ // CHECK: vector.transfer_read {{.*}} + // CHECK-NOT: in_bounds + %3 = vector.transfer_read %A[%c0, %idx], %f0 : memref, vector<1x2xf32> + + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} + %4 = vector.transfer_read %B[%c0, %c0], %f0 {in_bounds = [true, false]} : memref<9x8xf32>, vector<5xf32> + // CHECK: return - return %1, %2 : vector<4x8xf32>, vector<4x9xf32> + return %1, %2, %3, %4 + : vector<4x8xf32>, vector<4x9xf32>, vector<1x2xf32>, vector<5xf32> } // ----- @@ -1130,7 +1143,7 @@ %cf0 = arith.constant 0.0 : f32 %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, tensor<4x4xf32> - %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true], + %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} : tensor<4x4xf32>, vector<4x2x6xf32> return %0 : vector<4x2x6xf32> @@ -1147,10 +1160,10 @@ %v0 : vector<4x1xf32>) -> vector<1x100x4x5xf32> { %c0 = arith.constant 0 : index %cf0 = arith.constant 0.0 : f32 - %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] {in_bounds = [true, true], + %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1, d2) -> (d2, d1)>} : vector<4x1xf32>, tensor<4x4x4xf32> - %0 = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 {in_bounds = [true, true, true, true], + %0 = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} : tensor<4x4x4xf32>, vector<1x100x4x5xf32> return %0 : vector<1x100x4x5xf32> @@ -1230,7 +1243,7 @@ // CHECK-SAME: [%[[IV]], 16] [%[[SZ]], 8] // CHECK: %[[T1:.*]] = vector.transfer_write %[[VEC]] // CHECK-SAME: %[[T0]][%[[C0]], %[[C0]]] - // CHECK-SAME: in_bounds = [true, false] + // CHECK-SAME: in_bounds = [false, true] // CHECK-SAME: permutation_map = #[[$MAP]] // CHECK: %[[T2:.*]] = tensor.insert_slice %[[T1]] into %[[ITER_ARG]] // CHECK-SAME: [%[[IV]], 16] [%[[SZ]], 8] @@ -2088,7 +2101,7 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.000000e+00 : f32 %0 = tensor.extract_slice %src[0, %i1, %i2, %i3] [1, 4, 1, 4] [1, 1, 1, 1] : tensor<1x8x8x8xf32> to tensor<1x4x4xf32> - %1 = vector.transfer_read %0[%c0, %i4, %c0], %f0 {in_bounds = [true]} : tensor<1x4x4xf32>, vector<4xf32> + %1 = vector.transfer_read %0[%c0, %i4, %c0], %f0 {in_bounds = [true, true, true]} : tensor<1x4x4xf32>, vector<4xf32> return %1 : vector<4xf32> } diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -383,7 +383,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + d1)>} : memref, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + d1)>, in_bounds = [true, true]} : memref, vector<128xf32> } // ----- @@ -392,7 +392,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + 1)>} : memref, vector<128xf32> + %0 = 
vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + 1)>, in_bounds = [true, true]} : memref, vector<128xf32> } // ----- @@ -401,7 +401,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}} - %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>} : memref, vector<3x7xf32> + %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>, in_bounds = [true, true, true]} : memref, vector<3x7xf32> } // ----- @@ -442,22 +442,12 @@ %c3 = arith.constant 3 : index %f0 = arith.constant 0.0 : f32 %vf0 = vector.splat %f0 : vector<2x3xf32> - // expected-error@+1 {{ expects the optional in_bounds attr of same rank as permutation_map results: affine_map<(d0, d1) -> (d0, d1)>}} + // expected-error@+1 {{ expects the optional in_bounds attr of same rank as the source type: 2 vs inBounds of size: 1}} %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {in_bounds = [true], permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x2x3xf32> } // ----- -func.func @test_vector.transfer_read(%arg0: memref>) { - %c3 = arith.constant 3 : index - %f0 = arith.constant 0.0 : f32 - %vf0 = vector.splat %f0 : vector<2x3xf32> - // expected-error@+1 {{requires broadcast dimensions to be in-bounds}} - %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1)->(0, d1)>} : memref>, vector<1x1x2x3xf32> -} - -// ----- - func.func @test_vector.transfer_read(%arg0: memref>) { %c3 = arith.constant 3 : index %f0 = arith.constant 0.0 : f32 @@ -538,7 +528,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<128 x f32> // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + d1)>} : vector<128xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + d1)>, in_bounds = [true, true]} : vector<128xf32>, memref } // ----- @@ -547,7 +537,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<128 x f32> // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + 1)>} : vector<128xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + 1)>, in_bounds = [true, true]} : vector<128xf32>, memref } // ----- @@ -556,7 +546,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<3 x 7 x f32> // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}} - vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>} : vector<3x7xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>, in_bounds = [true, true, true]} : vector<3x7xf32>, memref } // ----- diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -20,13 +20,13 @@ -> tensor { %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - 
%0 = vector.transfer_read %arg0[%c0], %f0 {permutation_map = affine_map<(d0)->()>} : + %0 = vector.transfer_read %arg0[%c0], %f0 {permutation_map = affine_map<(d0)->()>, in_bounds = [true]} : tensor, vector - %1 = vector.transfer_write %0, %arg0[%c0] {permutation_map = affine_map<(d0)->()>} : + %1 = vector.transfer_write %0, %arg0[%c0] {permutation_map = affine_map<(d0)->()>, in_bounds = [true]} : vector, tensor - %2 = vector.transfer_read %arg1[%c0, %c0], %f0 {permutation_map = affine_map<(d0, d1)->()>} : + %2 = vector.transfer_read %arg1[%c0, %c0], %f0 {permutation_map = affine_map<(d0, d1)->()>, in_bounds = [true, true]} : memref, vector - vector.transfer_write %2, %arg1[%c0, %c0] {permutation_map = affine_map<(d0, d1)->()>} : + vector.transfer_write %2, %arg1[%c0, %c0] {permutation_map = affine_map<(d0, d1)->()>, in_bounds = [true, true]} : vector, memref return %1: tensor } @@ -52,40 +52,40 @@ %m2 = vector.splat %i1 : vector<4x5xi1> // // CHECK: vector.transfer_read - %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>} : memref, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : memref, vector<128xf32> // CHECK: vector.transfer_read %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : memref, vector<3x7xf32> // CHECK: vector.transfer_read - %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>} : memref, vector<128xf32> + %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : memref, vector<128xf32> // CHECK: vector.transfer_read - %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>} : memref, vector<128xf32> + %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>, in_bounds = [true, false]} : memref, vector<128xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<1x1x4x3xf32> %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {in_bounds = [false, true]} : memref>, vector<1x1x4x3xf32> %5 = vector.transfer_read %arg1[%c3, %c3], %vf0 {in_bounds = [false, true]} : memref>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<5x24xi8> - %6 = vector.transfer_read %arg2[%c3, %c3], %v0 : memref>, vector<5x24xi8> + %6 = vector.transfer_read %arg2[%c3, %c3], %v0 {in_bounds = [true, true]} : memref>, vector<5x24xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<5x48xi8> - %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 : memref>, vector<5x48xi8> + %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 {in_bounds = [true, true]} : memref>, vector<5x48xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5xf32> - %8 = vector.transfer_read %arg0[%c3, %c3], %f0, %m : memref, vector<5xf32> + %8 = vector.transfer_read %arg0[%c3, %c3], %f0, %m {in_bounds = [true, false]} : memref, vector<5xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5x4x8xf32> - %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>} : memref, 
vector<5x4x8xf32> + %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>, in_bounds = [false, false, true]} : memref, vector<5x4x8xf32> // CHECK: vector.transfer_write - vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, memref + vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : vector<128xf32>, memref // CHECK: vector.transfer_write vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<3x7xf32>, memref // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> vector.transfer_write %4, %arg1[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : vector<1x1x4x3xf32>, memref> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> vector.transfer_write %5, %arg1[%c3, %c3] {in_bounds = [false, false]} : vector<1x1x4x3xf32>, memref> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x24xi8>, memref> - vector.transfer_write %6, %arg2[%c3, %c3] : vector<5x24xi8>, memref> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x48xi8>, memref> - vector.transfer_write %7, %arg3[%c3, %c3] : vector<5x48xi8>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x24xi8>, memref> + vector.transfer_write %6, %arg2[%c3, %c3] {in_bounds = [true, true]} : vector<5x24xi8>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x48xi8>, memref> + vector.transfer_write %7, %arg3[%c3, %c3] {in_bounds = [true, true]} : vector<5x48xi8>, memref> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : vector<5xf32>, memref - vector.transfer_write %8, %arg0[%c3, %c3], %m : vector<5xf32>, memref + vector.transfer_write %8, %arg0[%c3, %c3], %m {in_bounds = [true, false]} : vector<5xf32>, memref return } @@ -112,35 +112,35 @@ // // CHECK: vector.transfer_read - %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>} : tensor, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : tensor, vector<128xf32> // CHECK: vector.transfer_read - %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : tensor, vector<3x7xf32> + %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>, in_bounds = [true, true]} : tensor, vector<3x7xf32> // CHECK: vector.transfer_read - %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>} : tensor, vector<128xf32> + %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : tensor, vector<128xf32> // CHECK: vector.transfer_read - %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>} : tensor, vector<128xf32> + %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>, in_bounds = [true, false]} : tensor, vector<128xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<1x1x4x3xf32> %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 {permutation_map = affine_map<(d0, d1)->(d0, 
d1)>} : tensor>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {in_bounds = [false, true]} : tensor>, vector<1x1x4x3xf32> %5 = vector.transfer_read %arg1[%c3, %c3], %vf0 {in_bounds = [false, true]} : tensor>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<5x24xi8> - %6 = vector.transfer_read %arg2[%c3, %c3], %v0 : tensor>, vector<5x24xi8> + %6 = vector.transfer_read %arg2[%c3, %c3], %v0 {in_bounds = [true, true]} : tensor>, vector<5x24xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<5x48xi8> - %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 : tensor>, vector<5x48xi8> + %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 {in_bounds = [true, true]} : tensor>, vector<5x48xi8> // CHECK: vector.transfer_write - %8 = vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, tensor + %8 = vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : vector<128xf32>, tensor // CHECK: vector.transfer_write %9 = vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<3x7xf32>, tensor // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, tensor> %10 = vector.transfer_write %4, %arg1[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : vector<1x1x4x3xf32>, tensor> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, tensor> %11 = vector.transfer_write %5, %arg1[%c3, %c3] {in_bounds = [false, false]} : vector<1x1x4x3xf32>, tensor> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x24xi8>, tensor> - %12 = vector.transfer_write %6, %arg2[%c3, %c3] : vector<5x24xi8>, tensor> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x48xi8>, tensor> - %13 = vector.transfer_write %7, %arg3[%c3, %c3] : vector<5x48xi8>, tensor> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x24xi8>, tensor> + %12 = vector.transfer_write %6, %arg2[%c3, %c3] {in_bounds = [true, true]} : vector<5x24xi8>, tensor> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x48xi8>, tensor> + %13 = vector.transfer_write %7, %arg3[%c3, %c3] {in_bounds = [true, true]} : vector<5x48xi8>, tensor> return %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor>, diff --git a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir --- a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir +++ b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir @@ -7,7 +7,7 @@ // CHECK: return %[[r]] func.func @transfer_read_0d(%m: memref, %idx: index) -> f32 { %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst : memref, vector + %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true, true, true]} : memref, vector %1 = vector.extractelement %0[] : vector return %1 : f32 } @@ -23,7 +23,7 @@ func.func @transfer_read_1d(%m: memref, %idx: index, %idx2: index) -> f32 { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true]} : memref, vector<5xf32> + %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true, 
true, true]} : memref<?x?x?xf32>, vector<5xf32> %1 = vector.extractelement %0[%idx2 : index] : vector<5xf32> return %1 : f32 } @@ -36,7 +36,7 @@ // CHECK: return %[[r]] func.func @tensor_transfer_read_0d(%t: tensor<?x?x?xf32>, %idx: index) -> f32 { %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %t[%idx, %idx, %idx], %cst : tensor<?x?x?xf32>, vector<f32> + %0 = vector.transfer_read %t[%idx, %idx, %idx], %cst {in_bounds = [true, true, true]} : tensor<?x?x?xf32>, vector<f32> %1 = vector.extractelement %0[] : vector<f32> return %1 : f32 } @@ -50,7 +50,7 @@ // CHECK: memref.store %[[extract]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_0d(%m: memref<?x?x?xf32>, %idx: index, %f: f32) { %0 = vector.broadcast %f : f32 to vector<f32> - vector.transfer_write %0, %m[%idx, %idx, %idx] : vector<f32>, memref<?x?x?xf32> + vector.transfer_write %0, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<f32>, memref<?x?x?xf32> return } @@ -61,7 +61,7 @@ // CHECK: memref.store %[[f]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_1d(%m: memref<?x?x?xf32>, %idx: index, %f: f32) { %0 = vector.broadcast %f : f32 to vector<1xf32> - vector.transfer_write %0, %m[%idx, %idx, %idx] : vector<1xf32>, memref<?x?x?xf32> + vector.transfer_write %0, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<1xf32>, memref<?x?x?xf32> return } @@ -75,7 +75,7 @@ // CHECK: return %[[r]] func.func @tensor_transfer_write_0d(%t: tensor<?x?x?xf32>, %idx: index, %f: f32) -> tensor<?x?x?xf32> { %0 = vector.broadcast %f : f32 to vector<f32> - %1 = vector.transfer_write %0, %t[%idx, %idx, %idx] : vector<f32>, tensor<?x?x?xf32> + %1 = vector.transfer_write %0, %t[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<f32>, tensor<?x?x?xf32> return %1 : tensor<?x?x?xf32> } @@ -92,7 +92,7 @@ func.func @transfer_read_2d_extract(%m: memref<?x?x?x?xf32>, %idx: index, %idx2: index) -> f32 { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %0 = vector.transfer_read %m[%idx, %idx, %idx, %idx], %cst {in_bounds = [true, true]} : memref<?x?x?x?xf32>, vector<10x5xf32> + %0 = vector.transfer_read %m[%idx, %idx, %idx, %idx], %cst {in_bounds = [true, true, true, true]} : memref<?x?x?x?xf32>, vector<10x5xf32> %1 = vector.extract %0[8, 1] : vector<10x5xf32> return %1 : f32 } @@ -106,7 +106,7 @@ // CHECK: memref.store %[[extract]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_arith_constant(%m: memref<?x?x?xf32>, %idx: index) { %cst = arith.constant dense<5.000000e+00> : vector<1x1xf32> - vector.transfer_write %cst, %m[%idx, %idx, %idx] : vector<1x1xf32>, memref<?x?x?xf32> + vector.transfer_write %cst, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<1x1xf32>, memref<?x?x?xf32> return } diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -202,9 +202,9 @@ %c0 = arith.constant 0 : index // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f16 %f0 = arith.constant 0.
: f16 - // CHECK: %[[READ:.+]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[F0]] {in_bounds = [true]} : memref<1x4x8x16xf16>, vector<4xf16> + // CHECK: %[[READ:.+]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[F0]] {in_bounds = [true, true, true, true]} : memref<1x4x8x16xf16>, vector<4xf16> // CHECK: %[[CAST:.+]] = vector.broadcast %[[READ]] : vector<4xf16> to vector<1x4xf16> - %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true]} : memref<1x4x8x16xf16>, vector<1x4xf16> + %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true, true, true]} : memref<1x4x8x16xf16>, vector<1x4xf16> // CHECK: return %[[CAST]] return %0: vector<1x4xf16> } @@ -214,7 +214,7 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0. : f16 // CHECK: vector.broadcast %{{.+}} : vector<1xf16> to vector<1x1xf16> - %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true]} : memref<1x1x1x1xf16>, vector<1x1xf16> + %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16>, vector<1x1xf16> return %0: vector<1x1xf16> } @@ -223,9 +223,9 @@ // CHECK: %[[C0:.+]] = arith.constant 0 : index %c0 = arith.constant 0 : index // CHECK: %[[CAST:.+]] = vector.extract %{{.*}}[0] : vector<1x4xf16> - // CHECK: vector.transfer_write %[[CAST]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} : vector<4xf16>, memref<1x4x8x16xf16> + // CHECK: vector.transfer_write %[[CAST]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true, true]} : vector<4xf16>, memref<1x4x8x16xf16> - vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf16>, memref<1x4x8x16xf16> + vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x4xf16>, memref<1x4x8x16xf16> return } @@ -233,7 +233,7 @@ func.func @cast_away_transfer_write_leading_one_dims_one_element(%arg0: memref<1x1x1x1xf16>, %arg1: vector<1x1xf16>) { %c0 = arith.constant 0 : index // CHECK: vector.extract %{{.+}}[0] : vector<1x1xf16> - vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x1xf16>, memref<1x1x1x1xf16> + vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1xf16>, memref<1x1x1x1xf16> return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir @@ -3,7 +3,7 @@ func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> + %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> return %0 : vector<1x8x1xf32> } // CHECK: func @contiguous_inner_most_view(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir 
b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir @@ -4,7 +4,7 @@ %arg : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>) -> vector<3x2xi8> { %c0 = arith.constant 0 : index %cst = arith.constant 0 : i8 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false]} : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>, vector<3x2xi8> return %v : vector<3x2xi8> } @@ -16,7 +16,7 @@ func.func @transfer_write_rank_reducing(%arg : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>, %vec : vector<3x2xi8>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false]} : vector<3x2xi8>, memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>> return } @@ -30,7 +30,7 @@ %arg : memref<1x1x3x2x1xf32>) -> vector<3x2x1xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false, false]} : memref<1x1x3x2x1xf32>, vector<3x2x1xf32> return %v : vector<3x2x1xf32> } @@ -44,7 +44,7 @@ %arg : memref<1x1x3x2x1xf32>, %vec : vector<3x2x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false, false]} : vector<3x2x1xf32>, memref<1x1x3x2x1xf32> return } @@ -58,7 +58,7 @@ %arg : memref<1x1x1x1x1xf32>) -> vector<1x1x1xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false, false]} : memref<1x1x1x1x1xf32>, vector<1x1x1xf32> return %v : vector<1x1x1xf32> } @@ -72,7 +72,7 @@ %arg : memref<1x1x1x1x1xf32>, %vec : vector<1x1x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false, false]} : vector<1x1x1xf32>, memref<1x1x1x1x1xf32> return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -65,7 +65,7 @@ func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memref>, %arg1 : index, %arg2 : index) -> vector<8x4xi8> { %c0_i8 = arith.constant 0 : i8 %c0 = arith.constant 0 : index - %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref>, vector<8x4xi8> + %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true]} : memref>, vector<8x4xi8> return %result : vector<8x4xi8> } @@ -77,7 +77,7 @@ // CHECK-SAME: : memref into memref // CHECK: %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]] // CHECK-SAME: [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]] -// CHECK-SAME: {in_bounds = [true]} +// CHECK-SAME: {in_bounds = [true, true, true]} // CHECK-SAME: : memref, vector<32xi8> // CHECK: 
%[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8> // CHECK: return %[[VEC2D]] : vector<8x4xi8> @@ -86,7 +86,7 @@ func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vector<8x4xi8>, %dst : memref>, %arg1 : index, %arg2 : index) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true]} : vector<8x4xi8>, memref> + vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<8x4xi8>, memref> return } @@ -98,7 +98,7 @@ // CHECK: %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8> // CHECK: vector.transfer_write %[[VEC1D]], %[[COLLAPSED]] // CHECK-SAME: [%[[ARG2]], %[[ARG3]], %[[C0]]] -// CHECK-SAME: {in_bounds = [true]} +// CHECK-SAME: {in_bounds = [true, true, true]} // CHECK-SAME: : vector<32xi8>, memref // ----- diff --git a/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir b/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir @@ -0,0 +1,33 @@ +// RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file | FileCheck %s + +transform.sequence failures(propagate) { +^bb1(%func_op: !transform.op<"func.func">): + transform.apply_patterns to %func_op { + transform.apply_patterns.vector.materialize_masks + } : !transform.op<"func.func"> +} + +// CHECK-LABEL: func @mask_1d_transfer( +// CHECK: arith.cmpi +// CHECK: vector.transfer_read %{{.*}}[{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} +func.func @mask_1d_transfer(%t: tensor, %i: index, %j: index) + -> vector<5xf32> +{ + %cst = arith.constant 5.5 : f32 + %0 = vector.transfer_read %t[%i, %j], %cst {in_bounds = [true, false]} + : tensor, vector<5xf32> + return %0 : vector<5xf32> +} + +// CHECK-LABEL: func @mask_2d_transfer( +// CHECK-NOT: arith.cmpi +// CHECK: vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, false]} +func.func @mask_2d_transfer(%t: tensor, %i: index, %j: index) + -> vector<5x4xf32> +{ + %cst = arith.constant 5.5 : f32 + // Masks are currently not materialized for transfers that are 2D or higher. 
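+ // (For comparison, the 1-D case above is rewritten to use an explicit mask;
+ // roughly, as a sketch with assumed names rather than the exact generated IR:
+ //   %d1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ //   %rem = arith.subi %d1, %j : index
+ //   %mask = vector.create_mask %rem : vector<5xi1>
+ // after which the transfer_read is emitted in-bounds with that mask.)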
+ %0 = vector.transfer_read %t[%i, %j], %cst {in_bounds = [true, false]} + : tensor, vector<5x4xf32> + return %0 : vector<5x4xf32> +} diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir @@ -12,7 +12,7 @@ %vf0 = vector.splat %fn1 : vector<7xf32> %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1> vector.transfer_write %vf0, %A[%base1, %base2], %mask - {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false]} + {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false, true]} : vector<7xf32>, memref return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -46,8 +46,8 @@ func.func @transfer_to_load(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true]} : vector<4xf32>, memref<8x8xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xf32>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<4xf32>, memref<8x8xf32> return %res : vector<4xf32> } @@ -78,8 +78,8 @@ func.func @transfer_vector_element(%mem : memref<8x8xvector<2x4xf32>>, %i : index) -> vector<2x4xf32> { %cf0 = arith.constant dense<0.0> : vector<2x4xf32> - %res = vector.transfer_read %mem[%i, %i], %cf0 : memref<8x8xvector<2x4xf32>>, vector<2x4xf32> - vector.transfer_write %res, %mem[%i, %i] : vector<2x4xf32>, memref<8x8xvector<2x4xf32>> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]}: memref<8x8xvector<2x4xf32>>, vector<2x4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<2x4xf32>, memref<8x8xvector<2x4xf32>> return %res : vector<2x4xf32> } @@ -89,15 +89,15 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xvector<2x4xf32>>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<1x2x4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant dense<0.000000e+00> : vector<2x4xf32> -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> -// CHECK-NEXT: vector.transfer_write %[[RES:.*]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> +// CHECK-NEXT: vector.transfer_write %[[RES:.*]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true, true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> // CHECK-NEXT: return %[[RES]] : vector<1x2x4xf32> // CHECK-NEXT: } func.func @transfer_vector_element_different_types(%mem : memref<8x8xvector<2x4xf32>>, %i : index) -> vector<1x2x4xf32> { %cf0 = arith.constant dense<0.0> : vector<2x4xf32> - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds 
= [true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> return %res : vector<1x2x4xf32> } @@ -125,15 +125,15 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xf32>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] : memref<8x8xf32>, vector<4xf32> -// CHECK-NEXT: vector.transfer_write %[[RES]], %[[MEM]][%[[IDX]], %[[IDX]]] : vector<4xf32>, memref<8x8xf32> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, false]} : memref<8x8xf32>, vector<4xf32> +// CHECK-NEXT: vector.transfer_write %[[RES]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true, false]} : vector<4xf32>, memref<8x8xf32> // CHECK-NEXT: return %[[RES]] : vector<4xf32> // CHECK-NEXT: } func.func @transfer_not_inbounds(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 : memref<8x8xf32>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] : vector<4xf32>, memref<8x8xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, false]} : memref<8x8xf32>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, false]} : vector<4xf32>, memref<8x8xf32> return %res : vector<4xf32> } @@ -148,8 +148,8 @@ #layout = affine_map<(d0, d1) -> (d0*16 + d1)> func.func @transfer_nondefault_layout(%mem : memref<8x8xf32, #layout>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xf32, #layout>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true]} : vector<4xf32>, memref<8x8xf32, #layout> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xf32, #layout>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<4xf32>, memref<8x8xf32, #layout> return %res : vector<4xf32> } @@ -159,13 +159,13 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xf32>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true], permutation_map = #{{.*}}} : memref<8x8xf32>, vector<4xf32> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, true], permutation_map = #{{.*}}} : memref<8x8xf32>, vector<4xf32> // CHECK-NEXT: return %[[RES]] : vector<4xf32> // CHECK-NEXT: } func.func @transfer_perm_map(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<8x8xf32>, vector<4xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<8x8xf32>, vector<4xf32> return %res : vector<4xf32> } @@ -183,7 +183,7 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 %res = vector.transfer_read %mem[%i, %i], %cf0 - 
-      {in_bounds = [true], permutation_map = #broadcast_1d}
+      {in_bounds = [true, true], permutation_map = #broadcast_1d}
       : memref<8x8xf32>, vector<4xf32>
   return %res : vector<4xf32>
 }
@@ -197,7 +197,7 @@
 // CHECK-NEXT: }
 func.func @transfer_scalar(%mem : memref<?x?xf32>, %i : index) -> vector<1xf32> {
   %cf0 = arith.constant 0.0 : f32
-  %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<?x?xf32>, vector<1xf32>
+  %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<?x?xf32>, vector<1xf32>
   return %res : vector<1xf32>
 }
@@ -232,7 +232,7 @@
 func.func @transfer_broadcasting_complex(%mem : memref<10x20x30x8x8xf32>, %i : index) -> vector<3x2x4x5xf32> {
   %cf0 = arith.constant 0.0 : f32
   %res = vector.transfer_read %mem[%i, %i, %i, %i, %i], %cf0
-      {in_bounds = [true, true, true, true], permutation_map = #broadcast_2d_in_4d}
+      {in_bounds = [true, true, true, true, true], permutation_map = #broadcast_2d_in_4d}
       : memref<10x20x30x8x8xf32>, vector<3x2x4x5xf32>
   return %res : vector<3x2x4x5xf32>
 }
@@ -271,19 +271,19 @@
 // CHECK: %[[MASK0:.*]] = vector.splat %{{.*}} : vector<14x7xi1>
   %mask0 = vector.splat %m : vector<14x7xi1>
   %0 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask0 {in_bounds = [true, false, true, true], permutation_map = #map0} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [false, true, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [true, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32>
 // CHECK: vector.transpose %{{.*}}, [1, 0, 2, 3] : vector<14x7x8x16xf32> to vector<7x14x8x16xf32>

 // CHECK: %[[MASK1:.*]] = vector.splat %{{.*}} : vector<16x14xi1>
   %mask1 = vector.splat %m : vector<16x14xi1>
-  %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {permutation_map = #map1} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
+  %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {permutation_map = #map1, in_bounds = [false, false, true, true]} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
 // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32>

 // CHECK: %[[MASK3:.*]] = vector.splat %{{.*}} : vector<14x7xi1>
   %mask2 = vector.splat %m : vector<14x7xi1>
   %2 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask2 {in_bounds = [true, false, true, true], permutation_map = #map2} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [true, false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32>
 // CHECK: vector.broadcast %{{.*}} : vector<14x16x7xf32> to vector<8x14x16x7xf32>
 // CHECK: vector.transpose %{{.*}}, [3, 1, 0, 2] : vector<8x14x16x7xf32> to vector<7x14x8x16xf32>
@@ -301,7 +301,7 @@
 // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[CF0]] : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
 // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32>

-  %6 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map6} : memref<?x?xf32>, vector<8xf32>
+  %6 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map6, in_bounds = [true, true]} : memref<?x?xf32>, vector<8xf32>
 // CHECK: memref.load %{{.*}}[%[[C0]], %[[C0]]] : memref<?x?xf32>
 // CHECK: vector.broadcast %{{.*}} : f32 to vector<8xf32>
@@ -326,11 +326,11 @@
   %mask0 = vector.splat %m : vector<16x14x7x8xi1>
   %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0], %mask0 {in_bounds = [true, false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)>} : vector<7x14x8x16xf32>, tensor<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC0:.*]] = vector.transpose %{{.*}} [3, 1, 0, 2] : vector<7x14x8x16xf32> to vector<16x14x7x8xf32>
-  // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[ARG1]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, true, false]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32>
+  // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[ARG1]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, false, true]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32>

-  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
+  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC1:.*]] = vector.transpose %{{.*}} [1, 0] : vector<8x16xf32> to vector<16x8xf32>
-  // CHECK: vector.transfer_write %[[NEW_VEC1]], %[[ARG0]][%c0, %c0, %c0, %c0] : vector<16x8xf32>, memref<?x?x?x?xf32>
+  // CHECK: vector.transfer_write %[[NEW_VEC1]], %[[ARG0]][%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false]} : vector<16x8xf32>, memref<?x?x?x?xf32>

   return %0 : tensor<?x?x?x?xf32>
 }
@@ -347,15 +347,15 @@
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   %c0 = arith.constant 0 : index

-  %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0] {in_bounds = [false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>} : vector<14x8x16xf32>, tensor<?x?x?x?xf32>
+  %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0] {in_bounds = [false, false, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>} : vector<14x8x16xf32>, tensor<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC0:.*]] = vector.broadcast %{{.*}} : vector<14x8x16xf32> to vector<1x14x8x16xf32>
   // CHECK: %[[NEW_VEC1:.*]] = vector.transpose %[[NEW_VEC0]], [1, 2, 0, 3] : vector<1x14x8x16xf32> to vector<14x8x1x16xf32>
   // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC1]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, false, true, true]} : vector<14x8x1x16xf32>, tensor<?x?x?x?xf32>

-  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
+  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>, in_bounds = [true, false, false, true]} : vector<8x16xf32>, memref<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC2:.*]] = vector.broadcast %{{.*}} : vector<8x16xf32> to vector<1x8x16xf32>
   // CHECK: %[[NEW_VEC3:.*]] = vector.transpose %[[NEW_VEC2]], [1, 2, 0] : vector<1x8x16xf32> to vector<8x16x1xf32>
-  // CHECK: vector.transfer_write %[[NEW_VEC3]], %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, false, true]} : vector<8x16x1xf32>, memref<?x?x?x?xf32>
+  // CHECK: vector.transfer_write %[[NEW_VEC3]], %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, false, false, true]} : vector<8x16x1xf32>, memref<?x?x?x?xf32>

   return %0 : tensor<?x?x?x?xf32>
 }
diff --git a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
--- a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
@@ -199,7 +199,7 @@
 func.func @transfer_read_unroll_broadcast(%arg0 : memref<6x4xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
   return %0 : vector<6x4xf32>
 }
@@ -226,7 +226,7 @@
 func.func @transfer_read_unroll_broadcast_permuation(%arg0 : memref<6x4xf32>) -> vector<4x6xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [false, true], permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
   return %0 : vector<4x6xf32>
 }
@@ -272,7 +272,7 @@
 func.func @transfer_read_unroll_different_rank(%arg0 : memref<?x?x?xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %cf0 {permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %cf0 {in_bounds = [false, true, false], permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
   return %0 : vector<6x4xf32>
 }
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -127,7 +127,7 @@
     %v = "test.dummy_op"() : () -> (vector<1xf32>)
     %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
     vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32>
-    vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32>
+    vector.transfer_write %v, %arg1[%c0, %c0] {in_bounds = [true, false]}: vector<1xf32>, memref<1024x1024xf32>
   }
   return
 }
@@ -535,7 +535,7 @@
 // CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
 // CHECK-D:   vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
   vector.warp_execute_on_lane_0(%laneid)[32] {
-    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
+    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true, true, true]} : memref<4x2x32xf32>, vector<32xf32>
     %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
     %2 = vector.extractelement %1[] : vector<f32>
     %3 = vector.reduction <add>, %0 : vector<32xf32> into f32
@@ -1147,7 +1147,7 @@
     %29 = vector.extract %28[0] : vector<1x64xi32>
     %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
     %36 = vector.extractelement %30[%c0_i32 : i32] : vector<64xindex>
-    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
+    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true, true, true]} : memref<1x4x1024xf32>, vector<64xf32>
     vector.yield %37 : vector<64xf32>
   }
   return %18 : vector<2xf32>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
@@ -159,11 +159,11 @@
     //
     %f0 = arith.constant 0.0 : f32
     scf.for %i = %c0 to %c8 step %c1 {
-      %v1 = vector.transfer_read %C1[%i, %c0], %f0
+      %v1 = vector.transfer_read %C1[%i, %c0], %f0 {in_bounds = [true, true]}
         : tensor<8x8xf32>, vector<8xf32>
-      %v2 = vector.transfer_read %C2[%i, %c0], %f0
+      %v2 = vector.transfer_read %C2[%i, %c0], %f0 {in_bounds = [true, true]}
        : tensor<8x8xf32>, vector<8xf32>
-      %v3 = vector.transfer_read %C3[%i, %c0], %f0
+      %v3 = vector.transfer_read %C3[%i, %c0], %f0 {in_bounds = [true, true]}
        : tensor<8x8xf32>, vector<8xf32>
      vector.print %v1 : vector<8xf32>
      vector.print %v2 : vector<8xf32>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
@@ -124,7 +124,7 @@
     // CHECK: ( 0, 520, 0, 0, 1250 )
     //
     scf.for %i = %c0 to %c5 step %c1 {
-      %v = vector.transfer_read %0[%i, %c0], %d0: tensor<?x?xf32>, vector<5xf32>
+      %v = vector.transfer_read %0[%i, %c0], %d0 {in_bounds = [true, true]}: tensor<?x?xf32>, vector<5xf32>
       vector.print %v : vector<5xf32>
     }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
@@ -124,12 +124,12 @@
     //
     %x = sparse_tensor.convert %0 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
     scf.for %i = %c0 to %c4 step %c1 {
-      %v1 = vector.transfer_read %x[%i, %c0], %du: tensor<4x3xf64>, vector<3xf64>
+      %v1 = vector.transfer_read %x[%i, %c0], %du {in_bounds = [true, true]} : tensor<4x3xf64>, vector<3xf64>
       vector.print %v1 : vector<3xf64>
     }
     %y = sparse_tensor.convert %1 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
     scf.for %i = %c0 to %c4 step %c1 {
-      %v2 = vector.transfer_read %y[%i, %c0], %du: tensor<4x3xf64>, vector<3xf64>
+      %v2 = vector.transfer_read %y[%i, %c0], %du {in_bounds = [true, true]} : tensor<4x3xf64>, vector<3xf64>
       vector.print %v2 : vector<3xf64>
     }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -65,21 +65,21 @@
     %c5 = arith.constant 5 : index
     %c6 = arith.constant 6 : index
     %c7 = arith.constant 7 : index
-    %r0 = vector.transfer_read %mat[%c0,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r0 = vector.transfer_read %mat[%c0,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r0 : vector<8xf32>
-    %r1 = vector.transfer_read %mat[%c1,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r1 = vector.transfer_read %mat[%c1,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r1 : vector<8xf32>
-    %r2 = vector.transfer_read %mat[%c2,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r2 = vector.transfer_read %mat[%c2,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r2 : vector<8xf32>
-    %r3 = vector.transfer_read %mat[%c3,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r3 = vector.transfer_read %mat[%c3,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r3 : vector<8xf32>
-    %r4 = vector.transfer_read %mat[%c4,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r4 = vector.transfer_read %mat[%c4,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r4 : vector<8xf32>
-    %r5 = vector.transfer_read %mat[%c5,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r5 = vector.transfer_read %mat[%c5,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r5 : vector<8xf32>
-    %r6 = vector.transfer_read %mat[%c6,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r6 = vector.transfer_read %mat[%c6,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r6 : vector<8xf32>
-    %r7 = vector.transfer_read %mat[%c7,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r7 = vector.transfer_read %mat[%c7,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r7 : vector<8xf32>
     return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -130,18 +130,18 @@
     %quad_col = affine.apply affine_map<()[s0]->(s0 * 2)>()[%col_8x4] // account for 2xf16/col

     // Load quad (0, 0)
-    %A_quad00 = vector.transfer_read %argA[%quad_row, %quad_col], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad00 = vector.transfer_read %argA[%quad_row, %quad_col], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (1, 0). Just shift row down 8.
     %quad_row_plus_8 = affine.apply affine_map<(d0)[]->(d0+8)>(%quad_row)[]
-    %A_quad10 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad10 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (0, 1). Just shift col right 8 (4 2xf16 values)
     %quad_col_plus_8 = affine.apply affine_map<(d0)[]->(d0+8)>(%quad_col)[]
-    %A_quad01 = vector.transfer_read %argA[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad01 = vector.transfer_read %argA[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (1, 1)
-    %A_quad11 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad11 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Assemble the elements into a vector
     %A_init0 = arith.constant dense<0.0> : vector<4x2xf16>
@@ -166,18 +166,18 @@
     // (t) -> (t/4, t % 4). So we can re-use some of the calculation from A.
     // Load quad (0, 0)
-    %B_quad0 = vector.transfer_read %argB[%quad_row, %quad_col], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad0 = vector.transfer_read %argB[%quad_row, %quad_col], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 1)
-    %B_quad1 = vector.transfer_read %argB[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad1 = vector.transfer_read %argB[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 2)
     %quad_col_plus_16 = affine.apply affine_map<()[s0]->(s0 + 16)>()[%quad_col]
-    %B_quad2 = vector.transfer_read %argB[%quad_row, %quad_col_plus_16], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad2 = vector.transfer_read %argB[%quad_row, %quad_col_plus_16], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 3)
     %quad_col_plus_24 = affine.apply affine_map<()[s0]->(s0 + 24)>()[%quad_col]
-    %B_quad3 = vector.transfer_read %argB[%quad_row, %quad_col_plus_24], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad3 = vector.transfer_read %argB[%quad_row, %quad_col_plus_24], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Assemble into vector
     %B_init0 = arith.constant dense<0.0> : vector<4x2xf16>
@@ -208,8 +208,8 @@
     // vector1: (tid) -> (tid / 4 + 8, tid %4)
     %C_0 = vector.extract %d[0] : vector<2x2xf16>
     %C_1 = vector.extract %d[1] : vector<2x2xf16>
-    vector.transfer_write %C_0, %argC[%quad_row, %quad_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
-    vector.transfer_write %C_1, %argC[%quad_row_plus_8, %quad_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
+    vector.transfer_write %C_0, %argC[%quad_row, %quad_col] {in_bounds = [true, true]} : vector<2xf16>, memref<16x8xf16>
+    vector.transfer_write %C_1, %argC[%quad_row_plus_8, %quad_col] {in_bounds = [true, true]} : vector<2xf16>, memref<16x8xf16>

     gpu.return
   }
@@ -319,7 +319,7 @@
     // CHECK-NEXT: ( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 )
     //
     scf.for %pai = %c0 to %c16 step %c1 {
-      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
+      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16xf16>
       vector.print %pa0 : vector<16xf16>
     }
@@ -338,7 +338,7 @@
     //
     //
     scf.for %pbi = %c0 to %c8 step %c1 {
-      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<8x32xf16>, vector<32xf16>
+      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<32xf16>
       vector.print %pb0 : vector<32xf16>
     }
@@ -395,7 +395,7 @@
     // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 )
     //
     scf.for %pci = %c0 to %c16 step %c1 {
-      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x8xf16>, vector<8xf16>
+      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 {in_bounds = [true, true]} : memref<16x8xf16>, vector<8xf16>
       vector.print %pc0 : vector<8xf16>
     }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
@@ -21,7 +21,7 @@
 func.func @transfer_read_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
       : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -41,7 +41,7 @@
   scf.for %arg3 = %c0 to %c6 step %c3 {
     %0 = memref.subview %A[%arg2, %arg3] [1, 2] [1, 1]
        : memref<?x?xf32> to memref<1x2xf32, strided<[?, 1], offset: ?>>
-    %1 = vector.transfer_read %0[%c0, %c0], %fm42 {in_bounds=[true]}
+    %1 = vector.transfer_read %0[%c0, %c0], %fm42 {in_bounds=[true, true]}
        : memref<1x2xf32, strided<[?, 1], offset: ?>>, vector<2xf32>
     vector.print %1 : vector<2xf32>
   }
@@ -59,7 +59,7 @@
   %fm42 = arith.constant -42.0: f32
   %1 = memref.reinterpret_cast %A to offset: [%c6], sizes: [%c4, %c6], strides: [%c6, %c1]
      : memref<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
-  %2 = vector.transfer_read %1[%c2, %c1], %fm42 {in_bounds=[true]}
+  %2 = vector.transfer_read %1[%c2, %c1], %fm42 {in_bounds=[true, true]}
      : memref<?x?xf32, strided<[?, ?], offset: ?>>, vector<4xf32>
   vector.print %2 : vector<4xf32>
   return
@@ -72,7 +72,7 @@
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %fm42 = arith.constant -42.0: f32
-  %vec = vector.transfer_read %B[%c2, %c1], %fm42 {in_bounds=[false]} : memref<4x3xf32, strided<[6, 2]>>, vector<3xf32>
+  %vec = vector.transfer_read %B[%c2, %c1], %fm42 {in_bounds=[true, false]} : memref<4x3xf32, strided<[6, 2]>>, vector<3xf32>
   vector.print %vec : vector<3xf32>
   return
 }
@@ -82,7 +82,7 @@
     %A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (0)>}
+      {permutation_map = affine_map<(d0, d1) -> (0)>, in_bounds=[true, true]}
      : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -93,7 +93,7 @@
     %A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true, true]}
      : memref<?x?xf32>, vector<3xf32>
   vector.print %f: vector<3xf32>
   return
@@ -105,7 +105,7 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1, 0, 1]> : vector<9xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -117,7 +117,7 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1]> : vector<3xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true, true]}
      : memref<?x?xf32>, vector<3xf32>
   vector.print %f: vector<3xf32>
   return
@@ -128,7 +128,7 @@
   %fn1 = arith.constant -1.0 : f32
   %vf0 = vector.splat %fn1 : vector<7xf32>
   vector.transfer_write %vf0, %A[%base1, %base2]
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : vector<7xf32>, memref<?x?xf32>
   return
 }
@@ -139,7 +139,7 @@
   %vf0 = vector.splat %fn1 : vector<7xf32>
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1>
   vector.transfer_write %vf0, %A[%base1, %base2], %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : vector<7xf32>, memref<?x?xf32>
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
@@ -57,7 +57,8 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1, 0, 1]> : vector<9xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (0, d1)>} :
+      {permutation_map = affine_map<(d0, d1) -> (0, d1)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
@@ -69,7 +70,8 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 1]> : vector<4xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} :
+      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
@@ -91,7 +93,8 @@
   %A : memref<?x?xf32>, %base1: index, %base2: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} :
+      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
@@ -9,30 +9,32 @@
 // RUN: FileCheck %s

 func.func @transfer_read_3d(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+                            %o: index, %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
+      {in_bounds = [true, false, false, false]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
 }

-func.func @transfer_read_3d_and_extract(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_and_extract(%A : memref<?x?x?x?xf32>, %o: index,
+                                        %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {in_bounds = [true, true, true]}
+      {in_bounds = [true, true, true, true]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   %sub = vector.extract %f[0] : vector<2x5x3xf32>
   vector.print %sub: vector<5x3xf32>
   return
 }

-func.func @transfer_read_3d_broadcast(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_broadcast(%A : memref<?x?x?x?xf32>, %o: index,
+                                      %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>,
+       in_bounds = [true, false, true, false]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
@@ -43,27 +45,30 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[0, 1]> : vector<2xi1>
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>,
+       in_bounds = [true, false, true, true]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
 }

-func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>, %o: index,
+                                       %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)>,
+       in_bounds = [false, false, true, false]}
       : memref<?x?x?x?xf32>, vector<3x5x3xf32>
   vector.print %f: vector<3x5x3xf32>
   return
 }

-func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>, %o: index, %a: index,
+                             %b: index, %c: index) {
   %fn1 = arith.constant -1.0 : f32
   %vf0 = vector.splat %fn1 : vector<2x9x3xf32>
   vector.transfer_write %vf0, %A[%o, %a, %b, %c]
+      {in_bounds = [true, false, false, false]}
       : vector<2x9x3xf32>, memref<?x?x?x?xf32>
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -101,7 +101,7 @@
   // CHECK-SAME: ( -42, -42, -42, -42, -42 ),
   // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )

-  %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1} : memref<?x?xf32>, vector<5xf32>
+  %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1, in_bounds = [true, false]} : memref<?x?xf32>, vector<5xf32>
   vector.print %5 : vector<5xf32>

   // CHECK-NEXT: ( 403, 503, 502, -42, -42 )
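
For reference, a minimal standalone sketch of the convention the test updates
above follow (function and value names are hypothetical, not taken from the
patch): `in_bounds` now carries one boolean per tensor/memref dimension rather
than one per vector dimension, so a 1-D read from a rank-2 memref takes two
entries.

func.func @in_bounds_per_source_dim(%mem : memref<4x8xf32>,
                                    %i : index, %j : index)
    -> (vector<8xf32>, vector<8xf32>) {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.0 : f32
  // Reading a full row starting at column 0: both source dimensions are
  // guaranteed in-bounds, so this transfer may lower to a simple load.
  %v0 = vector.transfer_read %mem[%i, %c0], %pad {in_bounds = [true, true]}
      : memref<4x8xf32>, vector<8xf32>
  // Starting at a dynamic column, dim 1 may run out-of-bounds as indices
  // increase, so it is conservatively marked false; out-of-bounds lanes
  // take the padding value %pad. Dim 0, which the vector does not sweep,
  // stays in-bounds because the starting point must always be in-bounds.
  %v1 = vector.transfer_read %mem[%i, %j], %pad {in_bounds = [true, false]}
      : memref<4x8xf32>, vector<8xf32>
  return %v0, %v1 : vector<8xf32>, vector<8xf32>
}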