diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -243,6 +243,25 @@
 /// independent of any loop induction variable involved in the nest.
 void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
 
+/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
+/// `loops` contains a list of perfectly nested loops that are normalized (step
+/// one and lower bound of zero) and with bounds and steps independent of any
+/// loop induction variable involved in the nest. Coalescing affine.for loops is
+/// not always possible, i.e., the result may not be representable using
+/// affine.for.
+LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
+
+/// Vectorizes a loop (either outer or inner, with a perfect or imperfectly
+/// nested body). `simdWidth` is the bit width of the vectors on target. `forOp`
+/// is never invalidated. This method does not check for contiguity of accesses
+/// or validity of vectorization. A memref.vector_cast op is used to create a
+/// vector element type version of the memref. If the memref involved with the
+/// vectorization is a function argument attribute, its `memref.alignment`
+/// argument attribute is checked to determine if it has the desired alignment.
+/// If not, the loop isn't vectorize and this returns failure.
+LogicalResult loopVectorize(AffineForOp forOp, unsigned simdWidth,
+                            DenseMap<Value, Value> *vecMemRefs = nullptr);
+
 /// Take the ParallelLoop and for each set of dimension indices, combine them
 /// into a single dimension. combinedDimensions must contain each index into
 /// loops exactly once.
diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Transforms/LoopCoalescing.cpp
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Transforms/LoopCoalescing.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
@@ -20,66 +21,70 @@
 
 namespace {
 struct LoopCoalescingPass : public LoopCoalescingBase<LoopCoalescingPass> {
-  void runOnFunction() override {
-    FuncOp func = getFunction();
 
-    func.walk([](scf::ForOp op) {
-      // Ignore nested loops.
-      if (op->getParentOfType<scf::ForOp>())
-        return;
+  /// Walk either an scf.for or an affine.for to find a band to coalesce.
+  template <typename LoopOpTy>
+  static void walkLoop(LoopOpTy op) {
+    // Ignore nested loops.
+    if (op->template getParentOfType<LoopOpTy>())
+      return;
 
-      SmallVector<scf::ForOp, 4> loops;
-      getPerfectlyNestedLoops(loops, op);
-      LLVM_DEBUG(llvm::dbgs()
-                 << "found a perfect nest of depth " << loops.size() << '\n');
+    SmallVector<LoopOpTy, 4> loops;
+    getPerfectlyNestedLoops(loops, op);
+    LLVM_DEBUG(llvm::dbgs()
+               << "found a perfect nest of depth " << loops.size() << '\n');
 
-      // Look for a band of loops that can be coalesced, i.e. perfectly nested
-      // loops with bounds defined above some loop.
-      // 1. For each loop, find above which parent loop its operands are
-      // defined.
-      SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
-      for (unsigned i = 0, e = loops.size(); i < e; ++i) {
-        operandsDefinedAbove[i] = i;
-        for (unsigned j = 0; j < i; ++j) {
-          if (areValuesDefinedAbove(loops[i].getOperands(),
-                                    loops[j].region())) {
-            operandsDefinedAbove[i] = j;
-            break;
-          }
+    // Look for a band of loops that can be coalesced, i.e. perfectly nested
+    // loops with bounds defined above some loop.
+    // 1. For each loop, find above which parent loop its operands are
+    // defined.
+    SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
+    for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+      operandsDefinedAbove[i] = i;
+      for (unsigned j = 0; j < i; ++j) {
+        if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].region())) {
+          operandsDefinedAbove[i] = j;
+          break;
         }
-        LLVM_DEBUG(llvm::dbgs()
-                   << "  bounds of loop " << i << " are known above depth "
-                   << operandsDefinedAbove[i] << '\n');
       }
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  bounds of loop " << i << " are known above depth "
+                 << operandsDefinedAbove[i] << '\n');
+    }
 
-      // 2. Identify bands of loops such that the operands of all of them are
-      // defined above the first loop in the band.  Traverse the nest bottom-up
-      // so that modifications don't invalidate the inner loops.
-      for (unsigned end = loops.size(); end > 0; --end) {
-        unsigned start = 0;
-        for (; start < end - 1; ++start) {
-          auto maxPos =
-              *std::max_element(std::next(operandsDefinedAbove.begin(), start),
-                                std::next(operandsDefinedAbove.begin(), end));
-          if (maxPos > start)
-            continue;
+    // 2. Identify bands of loops such that the operands of all of them are
+    // defined above the first loop in the band.  Traverse the nest bottom-up
+    // so that modifications don't invalidate the inner loops.
+    for (unsigned end = loops.size(); end > 0; --end) {
+      unsigned start = 0;
+      for (; start < end - 1; ++start) {
+        auto maxPos =
+            *std::max_element(std::next(operandsDefinedAbove.begin(), start),
+                              std::next(operandsDefinedAbove.begin(), end));
+        if (maxPos > start)
+          continue;
 
-          assert(maxPos == start &&
-                 "expected loop bounds to be known at the start of the band");
-          LLVM_DEBUG(llvm::dbgs() << "  found coalesceable band from " << start
-                                  << " to " << end << '\n');
+        assert(maxPos == start &&
+               "expected loop bounds to be known at the start of the band");
+        LLVM_DEBUG(llvm::dbgs() << "  found coalesceable band from " << start
+                                << " to " << end << '\n');
 
-          auto band =
-              llvm::makeMutableArrayRef(loops.data() + start, end - start);
-          coalesceLoops(band);
-          break;
-        }
-        // If a band was found and transformed, keep looking at the loops above
-        // the outermost transformed loop.
-        if (start != end - 1)
-          end = start + 1;
+        auto band =
+            llvm::makeMutableArrayRef(loops.data() + start, end - start);
+        (void)coalesceLoops(band);
+        break;
       }
-    });
+      // If a band was found and transformed, keep looking at the loops above
+      // the outermost transformed loop.
+      if (start != end - 1)
+        end = start + 1;
+    }
+  }
+
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+    func.walk([&](scf::ForOp op) { walkLoop(op); });
+    func.walk([&](AffineForOp op) { walkLoop(op); });
   }
 };
 
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -2045,7 +2045,7 @@
 
   builder.setInsertionPointToStart(outermost.getBody());
 
-  // 3. Remap induction variables.  For each original loop, the value of the
+  // 3. Remap induction variables. For each original loop, the value of the
   // induction variable can be obtained by dividing the induction variable of
   // the linearized loop by the total number of iterations of the loops nested
   // in it modulo the number of iterations in this loop (remove the values
@@ -2077,6 +2077,128 @@
   second.erase();
 }
 
+LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
+  if (loops.size() < 2)
+    return success();
+
+  AffineForOp innermost = loops.back();
+  AffineForOp outermost = loops.front();
+  AffineBound ub = outermost.getUpperBound();
+  AffineMap origUbMap = ub.getMap();
+  Location loc = outermost.getLoc();
+  OpBuilder builder(outermost);
+  for (AffineForOp loop : loops) {
+    // We only work on normalized loops.
+    if (loop.getStep() != 1 || loop.getConstantLowerBound() != 0)
+      return failure();
+  }
+  SmallVector<Value, 4> upperBoundSymbols;
+  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
+                                   ub.getOperands().end());
+
+  // 1. Store the upper bound of the outermost loop in a variable.
+  Value prev;
+  if (!llvm::hasSingleElement(origUbMap.getResults()))
+    prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
+  else
+    prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
+  upperBoundSymbols.push_back(prev);
+
+  // 2. Emit code computing the upper bound of the coalesced loop as product of
+  // the number of iterations of all loops.
+  for (AffineForOp loop : loops.drop_front()) {
+    ub = loop.getUpperBound();
+    origUbMap = ub.getMap();
+    ubOperands = ub.getOperands();
+    Value upperBound;
+    // If upper bound map has more than one result, take their minimum.
+    if (!(origUbMap.getResults().empty() ||
+          llvm::hasSingleElement(origUbMap.getResults())))
+      upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
+    else
+      upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
+    upperBoundSymbols.push_back(upperBound);
+    SmallVector<Value, 4> operands;
+    operands.push_back(prev);
+    operands.push_back(upperBound);
+    // Maintain running product of loop upper bounds.
+    prev = builder.create<AffineApplyOp>(
+        loc,
+        AffineMap::get(/*numDims=*/
+                       1, /*numSymbols=*/1,
+                       builder.getAffineDimExpr(0) *
+                           builder.getAffineSymbolExpr(0)),
+        operands);
+  }
+  // Set upper bound of the coalesced loop.
+  AffineMap newUbMap = AffineMap::get(
+      /*numDims=*/0,
+      /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
+  outermost.setUpperBound(prev, newUbMap);
+
+  builder.setInsertionPointToStart(outermost.getBody());
+
+  // 3. Remap induction variables. For each original loop, the value of the
+  // induction variable can be obtained by dividing the induction variable of
+  // the linearized loop by the total number of iterations of the loops nested
+  // in it modulo the number of iterations in this loop (remove the values
+  // related to the outer loops):
+  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+  // Compute these iteratively from the innermost loop by creating a "running
+  // quotient" of division by the range.
+  Value previous = outermost.getInductionVar();
+  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+    unsigned idx = loops.size() - i - 1;
+    if (i != 0) {
+      ub = loops[idx + 1].getUpperBound();
+      origUbMap = ub.getMap();
+      ubOperands = ub.getOperands();
+      SmallVector<Value, 4> operands;
+      operands.push_back(previous);
+      operands.push_back(upperBoundSymbols[idx + 1]);
+      previous = builder.create<AffineApplyOp>(
+          loc,
+          AffineMap::get(
+              /*numDims=*/1, /*numSymbols=*/1,
+              builder.getAffineDimExpr(0).floorDiv(
+                  builder.getAffineSymbolExpr(0))),
+          operands);
+    }
+    // Modified value of the induction variables of the nested loops after
+    // coalescing.
+    Value inductionVariable;
+    if (i == e - 1) {
+      inductionVariable = previous;
+    } else {
+      ub = loops[idx].getUpperBound();
+      origUbMap = ub.getMap();
+      ubOperands = ub.getOperands();
+      SmallVector<Value, 4> applyOperands;
+      applyOperands.push_back(previous);
+      applyOperands.push_back(upperBoundSymbols[idx]);
+      inductionVariable = builder.create<AffineApplyOp>(
+          loc,
+          AffineMap::get(
+              /*numDims=*/1, /*numSymbols=*/1,
+              builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
+          applyOperands);
+    }
+
+    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), inductionVariable,
+                               loops.back().region());
+  }
+
+  // 4. Move the operations from the innermost just above the second-outermost
+  // loop, delete the extra terminator and the second-outermost loop.
+  AffineForOp second = loops[1];
+  innermost.getBody()->back().erase();
+  outermost.getBody()->getOperations().splice(
+      Block::iterator(second.getOperation()),
+      innermost.getBody()->getOperations());
+  second.erase();
+  return success();
+}
+
 void mlir::collapseParallelLoops(
     scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
   OpBuilder outsideBuilder(loops);
diff --git a/mlir/test/Transforms/loop-coalescing.mlir b/mlir/test/Transforms/loop-coalescing.mlir
--- a/mlir/test/Transforms/loop-coalescing.mlir
+++ b/mlir/test/Transforms/loop-coalescing.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s
 
 // CHECK-LABEL: @one_3d_nest
 func @one_3d_nest() {
@@ -191,3 +191,147 @@
   }
   return
 }
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<() -> (16)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<() -> (64)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<() -> (8)>
+// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for() {
+  affine.for %i = 0 to 16 {
+    affine.for %j = 0 to 64 {
+      affine.for %k = 0 to 8 {
+        "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK-DAG: %[[T0:.*]] = affine.apply #[[MAP0]]()
+// CHECK-DAG: %[[T1:.*]] = affine.apply #[[MAP1]]()
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[MAP2]](%[[T0]])[%[[T1]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP3]]()
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP2]](%[[T2]])[%[[T3]]]
+// CHECK:       affine.for %[[IV:.*]] = 0 to %[[T4]]
+// CHECK-DAG:    %[[K:.*]] =  affine.apply #[[MAP4]](%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[T6:.*]] = affine.apply #[[MAP5]](%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[J:.*]] =  affine.apply #[[MAP4]](%[[T6]])[%[[T1]]]
+// CHECK-DAG:    %[[I:.*]] =  affine.apply #[[MAP5]](%[[T6]])[%[[T1]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %K = memref.dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i = 0 to %M {
+    affine.for %j = 0 to %N {
+      affine.for %k = 0 to %K {
+      "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP]]()[%[[T0]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP]]()[%[[T1]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP]]()[%[[T2]]]
+// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<() -> (64)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i = 0 to %M {
+    affine.for %j = 0 to %N {
+      affine.for %k = 0 to 64 {
+      "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[MAP0]]()[%[[T0]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP0]]()[%[[T1]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP1]](%[[T2]])[%[[T3]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[MAP2]]()
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP1]](%[[T4]])[%[[T5]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MAP3]](%[[IV]])[%[[T5]]]
+// CHECK-DAG:    %[[T8:.*]] = affine.apply #[[MAP4]](%[[IV]])[%[[T5]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MAP3]](%[[T8]])[%[[T3]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[MAP4]](%[[T8]])[%[[T3]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+#myMap = affine_map<()[s1] -> (s1, -s1)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+ %c0 = constant 0 : index
+ %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %K = memref.dim %arg0, %c0 : memref<?x?xf32>
+ affine.for %i = 0 to min #myMap()[%M] {
+   affine.for %j = 0 to %N {
+     affine.for %k = 0 to %K {
+     "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+     }
+   }
+ }
+ return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP1]]()[%[[T1]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[MAP2]](%[[T3]])[%[[T4]]]
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP1]]()[%[[T2]]]
+// CHECK-DAG: %[[T7:.*]] = affine.apply #[[MAP2]](%[[T5]])[%[[T6]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MAP3]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[T9:.*]] = affine.apply #[[MAP4]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MAP3]](%[[T9]])[%[[T4]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[MAP4]](%[[T9]])[%[[T4]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return