diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -512,6 +512,15 @@
   assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
 }
 
+static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
+                                        AffineExpr expr, ValueRange operands) {
+  AffineMap map = AffineMap::inferFromExprList({expr}).front();
+  SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
+  mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
+  canonicalizeMapAndOperands(&map, &normalizedOperands);
+  return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
+}
+
 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                      ValueRange tileSizes, AffineMap map, ValueRange lbs,
                      ValueRange ubs, ValueRange subShapeSizes) {
@@ -547,16 +556,21 @@
         applyMapToValues(builder, loc, m, subShapeSizes).front();
     // Resulting size needs to be made half open interval again.
     AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
-    Value size = makeComposedAffineApply(builder, loc, s0 + 1, closedIntSize);
+    Value size =
+        fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
 
     // The size of the subview / extract_slice should be trimmed to avoid
-    // out-of-bounds accesses, unless we statically know the subshape size
-    // divides the shape size evenly.
+    // out-of-bounds accesses, unless:
+    // a. We statically know the subshape size divides the shape size evenly.
+    // b. The subshape size is 1. According to the way the loops are set up,
+    //    tensors with "0" dimensions would never be constructed.
     int64_t shapeSize = shape[r];
     auto sizeCst = size.getDefiningOp<ConstantIndexOp>();
-    if (ShapedType::isDynamic(shapeSize) || !sizeCst ||
-        (shapeSize % sizeCst.getValue()) != 0) {
+    auto hasTileSizeOne = sizeCst && sizeCst.getValue() == 1;
+    auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
+                         ((shapeSize % sizeCst.getValue()) == 0);
+    if (!hasTileSizeOne && !dividesEvenly) {
       LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
                               << ", size: " << size
                               << ": make sure in bound with affine.min\n");
@@ -570,6 +584,7 @@
       Value d = applyMapToValues(builder, loc, m, ubs).front();
       SmallVector<Value, 4> operands{size, d, offset};
       fullyComposeAffineMapAndOperands(&minMap, &operands);
+      canonicalizeMapAndOperands(&minMap, &operands);
       size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
                                          operands);
     }
@@ -616,7 +631,7 @@
     // Before composing, we need to make range a closed interval.
     Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
     AffineExpr d0 = getAffineDimExpr(0, b.getContext());
-    sizes.push_back(makeComposedAffineApply(b, loc, d0 - 1, size));
+    sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
     LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
   }
   return sizes;
diff --git a/mlir/test/Dialect/Linalg/fusion-sequence.mlir b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
--- a/mlir/test/Dialect/Linalg/fusion-sequence.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
@@ -212,7 +212,6 @@
   }
 }
 
-// CHaECK: #[[MAP0:.+]] = affine_map<(d0, d1) -> (16, d0 - d1)>
 // CHECK: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
 // CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 16, -d0 + s1)>
 
diff --git a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir b/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
--- a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
+++ b/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
@@ -25,3 +25,50 @@
       outs(%out: tensor<?x2000xf32>) -> tensor<?x2000xf32>
   return %r : tensor<?x2000xf32>
 }
+
+// -----
+
+// The input IR of this test case is a tiled and peeled linalg.matmul op.
+
+// CHECK-LABEL: func @tiled_and_peeled_matmul(
+// CHECK: linalg.matmul ins({{.*}} : tensor<32x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<32x258xf32>) -> tensor<32x258xf32>
+// CHECK: linalg.matmul ins({{.*}} : tensor<1x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<1x258xf32>) -> tensor<1x258xf32>
+#map0 = affine_map<(d0) -> (64, -d0 + 257)>
+#map1 = affine_map<()[s0] -> ((s0 floordiv 32) * 32)>
+#map2 = affine_map<(d0)[s0] -> (d0 - (s0 floordiv 32) * 32)>
+
+func @tiled_and_peeled_matmul(%arg0: tensor<257x259xf32>, %arg1: tensor<259x258xf32>, %arg2: tensor<257x258xf32>) -> tensor<257x258xf32> {
+  %c257 = constant 257 : index
+  %c64 = constant 64 : index
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c32 = constant 32 : index
+  %0 = linalg.fill(%cst, %arg2) : f32, tensor<257x258xf32> -> tensor<257x258xf32>
+  %1 = scf.for %arg3 = %c0 to %c257 step %c64 iter_args(%arg4 = %0) -> (tensor<257x258xf32>) {
+    %2 = affine.min #map0(%arg3)
+    %3 = tensor.extract_slice %arg0[%arg3, 0] [%2, 259] [1, 1] : tensor<257x259xf32> to tensor<?x259xf32>
+    %4 = tensor.extract_slice %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<257x258xf32> to tensor<?x258xf32>
+    %5 = affine.apply #map1()[%2]
+    %6 = scf.for %arg5 = %c0 to %5 step %c32 iter_args(%arg6 = %4) -> (tensor<?x258xf32>) {
+      %10 = tensor.extract_slice %3[%arg5, 0] [32, 259] [1, 1] : tensor<?x259xf32> to tensor<32x259xf32>
+      %11 = tensor.extract_slice %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<?x258xf32> to tensor<32x258xf32>
+      %12 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%10, %arg1 : tensor<32x259xf32>, tensor<259x258xf32>) outs(%11 : tensor<32x258xf32>) -> tensor<32x258xf32>
+      %13 = tensor.insert_slice %12 into %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<32x258xf32> into tensor<?x258xf32>
+      scf.yield %13 : tensor<?x258xf32>
+    }
+    %7 = cmpi slt, %5, %2 : index
+    %8 = scf.if %7 -> (tensor<?x258xf32>) {
+      %10 = affine.apply #map2(%2)[%2]
+      %11 = tensor.extract_slice %3[%5, 0] [%10, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>
+      %12 = tensor.extract_slice %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> to tensor<?x258xf32>
+      %13 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%11, %arg1 : tensor<?x259xf32>, tensor<259x258xf32>) outs(%12 : tensor<?x258xf32>) -> tensor<?x258xf32>
+      %14 = tensor.insert_slice %13 into %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> into tensor<?x258xf32>
+      scf.yield %14 : tensor<?x258xf32>
+    } else {
+      scf.yield %6 : tensor<?x258xf32>
+    }
+    %9 = tensor.insert_slice %8 into %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<?x258xf32> into tensor<257x258xf32>
+    scf.yield %9 : tensor<257x258xf32>
+  }
+  return %1 : tensor<257x258xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
--- a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s -check-prefix=CHECK-1D
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s -check-prefix=CHECK-2D
 func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
              %B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
              %C: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>) {
@@ -11,17 +11,36 @@
   return
 }
 
-// CHECK-LABEL:func @matmul
-// CHECK: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+// CHECK-1D-LABEL:func @matmul
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
 //
-// CHECK: linalg.copy
-// CHECK: linalg.copy
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
 //
-// CHECK: vector.contract
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
-// CHECK-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+// CHECK-1D: vector.contract
+// CHECK-1D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-1D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
 //
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
+
+// CHECK-2D-LABEL:func @matmul
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+//
+// CHECK-2D: vector.contract
+// CHECK-2D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-2D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+//
+// CHECK-2D: linalg.copy
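A hypothetical before/after sketch of what exemption (b) in the Utils.cpp hunk changes for a unit tile size; the value names %t, %iv, %ub and the exact min map below are illustrative, not taken from the tests. Previously the slice size was still guarded by an affine.min, so the peeled operand kept a dynamic type:

  %size = affine.min affine_map<(d0)[s0] -> (1, -d0 + s0)>(%iv)[%ub]
  %lhs = tensor.extract_slice %t[%iv, 0] [%size, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>

With this change a constant tile size of 1 is used directly, so the slice folds to a statically shaped type, which is what the new linalg.matmul CHECK lines in tile-scalarize-dynamic-dims.mlir rely on:

  %lhs = tensor.extract_slice %t[%iv, 0] [1, 259] [1, 1] : tensor<?x259xf32> to tensor<1x259xf32>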