diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -821,9 +821,9 @@
   Value maxIndex = applyMapToValues(builder, loc, m, maxIndices).front();
   Value d = makeComposedAffineApply(builder, loc, plusOneMap, {maxIndex});
 
-  // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
+  // Compute min(dim - offset, size) to avoid out-of-bounds accesses.
   AffineMap minMap = AffineMap::inferFromExprList(
-                         {ArrayRef<AffineExpr>{dim0, dim1 - dim2}})
+                         {ArrayRef<AffineExpr>{dim1 - dim2, dim0}})
                          .front();
   SmallVector<Value> operands{size, d, offset};
   fullyComposeAffineMapAndOperands(&minMap, &operands);
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
@@ -1,16 +1,16 @@
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 24)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
+// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 24, d0)>
+// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
 
 // MATMUL: fuse_input
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 func.func @fuse_input(%arg0: tensor<24x12xf32>,
-                 %arg1: tensor<12x25xf32>,
-                 %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                      %arg1: tensor<12x25xf32>,
+                      %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -34,19 +34,19 @@
 // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]]
   %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
 
 // MATMUL: fuse_output
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 func.func @fuse_output(%arg0: tensor<24x12xf32>,
-                  %arg1: tensor<12x25xf32>,
-                  %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                       %arg1: tensor<12x25xf32>,
+                       %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
 // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
 // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index
   %c0 = arith.constant 0 : index
@@ -81,15 +81,15 @@
 // MATMUL-SAME: 0, 0
 // MATMUL-SAME: %[[D0]], %[[D1]]
   %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 25)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
+// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 25, d0)>
+// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
 
 #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -97,16 +97,16 @@
 // MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
 func.func @fuse_reduction(%arg0: tensor<24x12xf32>,
-                     %arg1: tensor<12x25xf32>,
-                     %arg2: tensor<24x25xf32>,
-                     %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
+                          %arg1: tensor<12x25xf32>,
+                          %arg2: tensor<24x25xf32>,
+                          %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
+  ^bb0(%arg4: f32, %arg5: f32):
     %2 = arith.addf %arg4, %arg5 : f32
     linalg.yield %2 : f32
   } -> tensor<12x25xf32>
@@ -129,7 +129,7 @@
 // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
   %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
@@ -141,16 +141,16 @@
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
 func.func @fuse_transposed(%arg0: tensor<24x12xf32>,
-                      %arg1: tensor<12x25xf32>,
-                      %arg2: tensor<24x25xf32>,
-                      %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
+                           %arg1: tensor<12x25xf32>,
+                           %arg2: tensor<24x25xf32>,
+                           %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
+  ^bb0(%arg4: f32, %arg5: f32):
     %2 = arith.addf %arg4, %arg5 : f32
     linalg.yield %2 : f32
   } -> tensor<24x12xf32>
@@ -167,7 +167,7 @@
 // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]]
   %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
@@ -176,8 +176,8 @@
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 func.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
-                            %arg1: tensor<12x25xf32>,
-                            %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                                 %arg1: tensor<12x25xf32>,
+                                 %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -200,7 +200,7 @@
 // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
   %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %2 : tensor<24x25xf32>
+  func.return %2 : tensor<24x25xf32>
 }
 
 // -----
@@ -211,15 +211,15 @@
 // MATMUL: fuse_indexed
 // MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
 func.func @fuse_indexed(%arg0: tensor<24x12xi32>,
-                   %arg1: tensor<12x25xi32>,
-                   %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
+                        %arg1: tensor<12x25xi32>,
+                        %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) {
-  ^bb0(%arg3: i32):
+  ^bb0(%arg3: i32):
     %6 = linalg.index 0 : index
     %7 = linalg.index 1 : index
     %8 = arith.addi %6, %7 : index
@@ -241,7 +241,7 @@
 // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
 // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
   %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
-  return %1 : tensor<24x25xi32>
+  func.return %1 : tensor<24x25xi32>
 }
 
 // -----
@@ -252,8 +252,8 @@
 // GENERIC: fuse_outermost_reduction
 // GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
 // GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
-func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
-                               %arg1: tensor<10xf32>) -> tensor<10xf32> {
+func.func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
+                                    %arg1: tensor<10xf32>) -> tensor<10xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
 
@@ -272,25 +272,25 @@
 // GENERIC-SAME: %[[IV1]]
 // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
   %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
+  ^bb0(%arg2: f32, %arg3: f32):
     %3 = arith.addf %arg2, %arg3 : f32
     linalg.yield %3 : f32
   } -> tensor<10xf32>
-  return %2 : tensor<10xf32>
+  func.return %2 : tensor<10xf32>
 }
 
 // -----
 
 // GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>
-// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, -d1 - d2 + 17)>
+// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (-d0 - d1 + 17, 8)>
+// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (-d1 - d2 + 17, d0)>
 
 #map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 
 // GENERIC: fuse_non_rectangular
 // GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
-                           %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
+func.func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
+                                %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
 // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index
 // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index
@@ -315,9 +315,9 @@
 // GENERIC-SAME: , %[[UB1]]
 // GENERIC: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
   %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
+  ^bb0(%arg2: f32, %arg3: f32):
     %2 = arith.addf %arg2, %arg3 : f32
     linalg.yield %2 : f32
   } -> tensor<10x8xf32>
-  return %1 : tensor<10x8xf32>
+  func.return %1 : tensor<10x8xf32>
 }