diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -550,6 +550,7 @@
                                     ubExprs, op.getContext());
   op.setUpperBounds(ranges.getOperands(), newUpperMap);
 }
+
 LogicalResult mlir::affine::normalizeAffineFor(AffineForOp op,
                                                bool promoteSingleIter) {
   if (promoteSingleIter && succeeded(promoteIfSingleIteration(op)))
@@ -571,86 +572,54 @@
   OpBuilder opBuilder(op);
   int64_t origLoopStep = op.getStep();
 
-  AffineBound lb = op.getLowerBound();
-  AffineMap originalLbMap = lb.getMap();
-  SmallVector<Value, 4> origLbOperands;
-  llvm::append_range(origLbOperands, lb.getOperands());
-
-  AffineBound ub = op.getUpperBound();
-  AffineMap originalUbMap = ub.getMap();
-  SmallVector<Value, 4> origUbOperands;
-  llvm::append_range(origUbOperands, ub.getOperands());
-
-  // Calculate upperBound for normalized loop.
-  SmallVector<Value, 4> ubOperands;
-  ubOperands.reserve(ub.getNumOperands() + lb.getNumOperands());
-
-  // Add dimension operands from upper/lower bound.
-  for (unsigned j = 0, e = originalUbMap.getNumDims(); j < e; ++j)
-    ubOperands.push_back(ub.getOperand(j));
-  for (unsigned j = 0, e = originalLbMap.getNumDims(); j < e; ++j)
-    ubOperands.push_back(lb.getOperand(j));
-
-  // Add symbol operands from upper/lower bound.
-  for (unsigned j = 0, e = originalUbMap.getNumSymbols(); j < e; ++j)
-    ubOperands.push_back(ub.getOperand(originalUbMap.getNumDims() + j));
-  for (unsigned j = 0, e = originalLbMap.getNumSymbols(); j < e; ++j)
-    ubOperands.push_back(lb.getOperand(originalLbMap.getNumDims() + j));
-
-  // Add original result expressions from lower/upper bound map.
-  SmallVector<AffineExpr, 1> origLbExprs(originalLbMap.getResults().begin(),
-                                         originalLbMap.getResults().end());
-  SmallVector<AffineExpr, 2> origUbExprs(originalUbMap.getResults().begin(),
-                                         originalUbMap.getResults().end());
-  SmallVector<AffineExpr, 4> newUbExprs;
-
-  // The original upperBound can have more than one result. For the new
-  // upperBound of this loop, take difference of all possible combinations of
-  // the ub results and lb result and ceildiv with the loop step. For e.g.,
-  //
-  //  affine.for %i1 = 0 to min affine_map<(d0)[] -> (d0 + 32, 1024)>(%i0)
-  //  will have an upperBound map as,
-  //  affine_map<(d0)[] -> (((d0 + 32) - 0) ceildiv 1, (1024 - 0) ceildiv
-  //  1)>(%i0)
-  //
-  // Insert all combinations of upper/lower bound results.
-  for (unsigned i = 0, e = origUbExprs.size(); i < e; ++i) {
-    newUbExprs.push_back(
-        (origUbExprs[i] - origLbExprs[0]).ceilDiv(origLoopStep));
-  }
-
-  // Construct newUbMap.
-  AffineMap newUbMap = AffineMap::get(
-      originalLbMap.getNumDims() + originalUbMap.getNumDims(),
-      originalLbMap.getNumSymbols() + originalUbMap.getNumSymbols(), newUbExprs,
-      opBuilder.getContext());
-  canonicalizeMapAndOperands(&newUbMap, &ubOperands);
-
-  SmallVector<Value, 4> lbOperands(lb.getOperands().begin(),
-                                   lb.getOperands().begin() +
-                                       originalLbMap.getNumDims());
-
-  // Normalize the loop.
-  op.setUpperBound(ubOperands, newUbMap);
+  // Construct the new upper bound value map.
+  AffineMap oldLbMap = op.getLowerBoundMap();
+  // The upper bound can have multiple results. To use
+  // AffineValueMap::difference, we need to have the same number of results in
+  // both lower and upper bound maps. So, we just create a value map for the
+  // lower bound with the only available lower bound result repeated to pad up
+  // to the number of upper bound results.
+  SmallVector<AffineExpr> lbExprs(op.getUpperBoundMap().getNumResults(),
+                                  op.getLowerBoundMap().getResult(0));
+  AffineValueMap lbMap(oldLbMap, op.getLowerBoundOperands());
+  AffineMap paddedLbMap =
+      AffineMap::get(oldLbMap.getNumDims(), oldLbMap.getNumSymbols(), lbExprs,
+                     op.getContext());
+  AffineValueMap paddedLbValueMap(paddedLbMap, op.getLowerBoundOperands());
+  AffineValueMap ubValueMap(op.getUpperBoundMap(), op.getUpperBoundOperands());
+  AffineValueMap newUbValueMap;
+  // Compute the `upper bound - lower bound`.
+  AffineValueMap::difference(ubValueMap, paddedLbValueMap, &newUbValueMap);
+  (void)newUbValueMap.canonicalize();
+
+  // Scale down the upper bound value map by the loop step.
+  unsigned numResult = newUbValueMap.getNumResults();
+  SmallVector<AffineExpr> scaleDownExprs(numResult);
+  for (unsigned i = 0; i < numResult; ++i)
+    scaleDownExprs[i] = opBuilder.getAffineDimExpr(i).ceilDiv(origLoopStep);
+  // `scaleDownMap` is (d0, d1, ..., d_n) -> (d0 / step, d1 / step, ..., d_n /
+  // step). Where `n` is the number of results in the upper bound map.
+  AffineMap scaleDownMap =
+      AffineMap::get(numResult, 0, scaleDownExprs, op.getContext());
+  AffineMap newUbMap = scaleDownMap.compose(newUbValueMap.getAffineMap());
+
+  // Set the newly create upper bound map and operands.
+  op.setUpperBound(newUbValueMap.getOperands(), newUbMap);
   op.setLowerBound({}, opBuilder.getConstantAffineMap(0));
   op.setStep(1);
 
   // Calculate the Value of new loopIV. Create affine.apply for the value of
   // the loopIV in normalized loop.
   opBuilder.setInsertionPointToStart(op.getBody());
-  // Add an extra dim operand for loopIV.
-  lbOperands.push_back(op.getInductionVar());
-  // Add symbol operands from lower bound.
-  for (unsigned j = 0, e = originalLbMap.getNumSymbols(); j < e; ++j)
-    lbOperands.push_back(origLbOperands[originalLbMap.getNumDims() + j]);
-
-  AffineExpr origIVExpr =
-      opBuilder.getAffineDimExpr(originalLbMap.getNumDims());
-  AffineExpr newIVExpr = origIVExpr * origLoopStep + originalLbMap.getResult(0);
-  AffineMap ivMap = AffineMap::get(originalLbMap.getNumDims() + 1,
-                                   originalLbMap.getNumSymbols(), newIVExpr);
-  canonicalizeMapAndOperands(&ivMap, &lbOperands);
-  Operation *newIV = opBuilder.create<AffineApplyOp>(loc, ivMap, lbOperands);
+  // Construct an affine.apply op mapping the new IV to the old IV.
+  AffineMap scaleIvMap =
+      AffineMap::get(1, 0, -opBuilder.getAffineDimExpr(0) * origLoopStep);
+  AffineValueMap scaleIvValueMap(scaleIvMap, ValueRange{op.getInductionVar()});
+  AffineValueMap newIvToOldIvMap;
+  AffineValueMap::difference(lbMap, scaleIvValueMap, &newIvToOldIvMap);
+  (void)newIvToOldIvMap.canonicalize();
+  auto newIV = opBuilder.create<AffineApplyOp>(
+      loc, newIvToOldIvMap.getAffineMap(), newIvToOldIvMap.getOperands());
   op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0), newIV);
   return success();
 }
diff --git a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
--- a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
+++ b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
@@ -88,7 +88,7 @@
 
 // CHECK-LABEL: func @loop_with_unknown_upper_bound
 // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index)
-// CHECK-NEXT:  %{{.*}} = arith.constant 0 : index
+// CHECK-NEXT:  arith.constant 0 : index
 // CHECK-NEXT:  %[[DIM:.*]] = memref.dim %arg0, %c0 : memref<?x?xf32>
 // CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$UB00]]()[%[[DIM]]] {
 // CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$IV00]](%[[I]])
@@ -119,7 +119,7 @@
 
 // CHECK-LABEL: func @loop_with_multiple_upper_bounds
 // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index)
-// CHECK-NEXT:  %{{.*}} = arith.constant 0 : index
+// CHECK-NEXT:  arith.constant 0 : index
 // CHECK-NEXT:  %[[DIM:.*]] = memref.dim %arg0, %c0 : memref<?x?xf32>
 // CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] {
 // CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]])
@@ -146,12 +146,12 @@
 // CHECK-DAG: [[$INTERUB:#map[0-9]*]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 // CHECK-DAG: [[$INTERIV:#map[0-9]*]] = affine_map<(d0) -> (d0 * 32)>
 // CHECK-DAG: [[$INTRAUB:#map[0-9]*]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
-// CHECK-DAG: [[$INTRAIV:#map[0-9]*]] = affine_map<(d0, d1) -> (d1 + d0)>
+// CHECK-DAG: [[$INTRAIV:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1)>
 
 // CHECK-LABEL: func @tiled_matmul
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1024x1024xf32>, %[[ARG2:.*]]: memref<1024x1024xf32>)
-// CHECK-NEXT:    %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT:    %{{.*}} = arith.constant 1 : index
+// CHECK-NEXT:    arith.constant 0 : index
+// CHECK-NEXT:    arith.constant 1 : index
 // CHECK-NEXT:    %[[DIM0:.*]] = memref.dim %[[ARG0]], %{{.*}}
 // CHECK-NEXT:    %[[DIM1:.*]] = memref.dim %[[ARG1]], %{{.*}}
 // CHECK-NEXT:    %[[DIM2:.*]] = memref.dim %[[ARG0]], %{{.*}}
@@ -167,11 +167,11 @@
 // CHECK-NEXT:              %[[JJIV:.*]] = affine.apply [[$INTRAIV]](%[[JIV]], %[[JJ]])
 // CHECK-NEXT:              affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]])[%[[DIM2]]] {
 // CHECK-NEXT:                %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]])
-// CHECK-NEXT:                %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32>
-// CHECK-NEXT:                %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32>
-// CHECK-NEXT:                %{{.*}} = affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32>
-// CHECK-NEXT:                %{{.*}} = arith.mulf
-// CHECK-NEXT:                %{{.*}} = arith.addf
+// CHECK-NEXT:                affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:                affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:                affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:                arith.mulf
+// CHECK-NEXT:                arith.addf
 // CHECK-NEXT:                affine.store %{{.*}}, %[[ARG2]]{{.*}} : memref<1024x1024xf32>
 // CHECK-NEXT:              }
 // CHECK-NEXT:            }
@@ -223,7 +223,7 @@
   scf.for %j = %c0 to %c1 step %c1 {
     // CHECK: affine.for %[[ARG0:.*]] =
     affine.for %i = %c0 to %c1 {
-      // CHECK-NEXT: %{{.*}} = affine.apply #map{{.*}}(%[[ARG0]])
+      // CHECK-NEXT: affine.apply #map{{.*}}(%[[ARG0]])
     }
   }
   return
@@ -292,3 +292,34 @@
   }
   return
 }
+
+// -----
+
+// CHECK: [[$MAP:#map[0-9]*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (2, (-d0 + 1024) ceildiv 32)>
+// CHECK: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1 * 32)>
+// CHECK: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (32, d0 - d1 + 64, -d1 + 1024)>
+// CHECK: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1)>
+#map0 = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> (d0 + 64, 1024)>
+#map2 = affine_map<(d0, d1) -> (d1 + 32, d0 + 64, 1024)>
+// CHECK-LABEL: @muli_level_tiled_matmul()
+func.func @muli_level_tiled_matmul() {
+  // CHECK-NEXT:  %[[BUF:.*]] = memref.alloc() : memref<1024xf16>
+  %0 = memref.alloc() : memref<1024xf16>
+  affine.for %arg0 = 0 to 1024 step 64 {
+    // CHECK-NEXT:  affine.for %[[ARG0:.*]] = 0 to 16 {
+    // CHECK-NEXT:    %[[IV0:.*]] = affine.apply [[$MAP]](%[[ARG0]])
+    affine.for %arg3 = #map0(%arg0) to min #map1(%arg0) step 32 {
+      // CHECK-NEXT:    affine.for %[[ARG1:.*]] = 0 to min [[$MAP1]](%[[IV0]]) {
+      // CHECK-NEXT:      %[[IV1:.*]] = affine.apply [[$MAP2]](%[[IV0]], %[[ARG1]])
+      affine.for %arg6 = #map0(%arg3) to min #map2(%arg0, %arg3) {
+        // CHECK-NEXT:      affine.for %[[ARG2:.*]] = 0 to min [[$MAP3]](%[[IV0]], %[[IV1]]) {
+        // CHECK-NEXT:        %[[IV2:.*]] = affine.apply [[$MAP4]](%[[IV1]], %[[ARG2]])
+        // CHECK-NEXT:        affine.load %[[BUF]][%[[IV2]]] : memref<1024xf16>
+        %3 = affine.load %0[%arg6] : memref<1024xf16>
+      }
+    }
+  }
+  return
+}