diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -79,13 +79,14 @@ AffineMap *map, SmallVectorImpl *operands, OpBuilder &builder); -/// Skew the operations in the body of a 'affine.for' operation with the +/// Skew the operations in the body of an affine.for operation with the /// specified operation-wise shifts. The shifts are with respect to the /// original execution order, and are multiplied by the loop 'step' before being -/// applied. +/// applied. If `unrollPrologueEpilogue` is set, fully unroll the prologue and +/// epilogue loops when possible. LLVM_NODISCARD -LogicalResult instBodySkew(AffineForOp forOp, ArrayRef shifts, - bool unrollPrologueEpilogue = false); +LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef shifts, + bool unrollPrologueEpilogue = false); /// Tiles the specified band of perfectly nested loops creating tile-space loops /// and intra-tile loops. A band is a contiguous set of loops. diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp @@ -22,6 +22,7 @@ #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" + #define DEBUG_TYPE "affine-pipeline-data-transfer" using namespace mlir; @@ -46,9 +47,9 @@ // Returns the position of the tag memref operand given a DMA operation. // Temporary utility: will be replaced when DmaStart/DmaFinish abstract op's are // added. TODO(b/117228571) -static unsigned getTagMemRefPos(Operation &dmaInst) { - assert(isa(dmaInst) || isa(dmaInst)); - if (auto dmaStartOp = dyn_cast(dmaInst)) { +static unsigned getTagMemRefPos(Operation &dmaOp) { + assert(isa(dmaOp) || isa(dmaOp)); + if (auto dmaStartOp = dyn_cast(dmaOp)) { return dmaStartOp.getTagMemRefOperandIndex(); } // First operand for a dma finish operation. @@ -79,21 +80,20 @@ auto oldMemRefType = oldMemRef.getType().cast(); auto newMemRefType = doubleShape(oldMemRefType); - // The double buffer is allocated right before 'forInst'. - auto *forInst = forOp.getOperation(); - OpBuilder bOuter(forInst); + // The double buffer is allocated right before 'forOp'. + OpBuilder bOuter(forOp); // Put together alloc operands for any dynamic dimensions of the memref. SmallVector allocOperands; unsigned dynamicDimCount = 0; for (auto dimSize : oldMemRefType.getShape()) { if (dimSize == -1) - allocOperands.push_back(bOuter.create(forInst->getLoc(), oldMemRef, - dynamicDimCount++)); + allocOperands.push_back( + bOuter.create(forOp.getLoc(), oldMemRef, dynamicDimCount++)); } // Create and place the alloc right before the 'affine.for' operation. Value newMemRef = - bOuter.create(forInst->getLoc(), newMemRefType, allocOperands); + bOuter.create(forOp.getLoc(), newMemRefType, allocOperands); // Create 'iv mod 2' value to index the leading dimension. auto d0 = bInner.getAffineDimExpr(0); @@ -118,8 +118,8 @@ return false; } // Insert the dealloc op right after the for loop. - bOuter.setInsertionPointAfter(forInst); - bOuter.create(forInst->getLoc(), newMemRef); + bOuter.setInsertionPointAfter(forOp); + bOuter.create(forOp.getLoc(), newMemRef); return true; } @@ -219,11 +219,11 @@ } // For each start operation, we look for a matching finish operation. 
- for (auto *dmaStartInst : dmaStartInsts) { - for (auto *dmaFinishInst : dmaFinishInsts) { - if (checkTagMatch(cast(dmaStartInst), - cast(dmaFinishInst))) { - startWaitPairs.push_back({dmaStartInst, dmaFinishInst}); + for (auto *dmaStartOp : dmaStartInsts) { + for (auto *dmaFinishOp : dmaFinishInsts) { + if (checkTagMatch(cast(dmaStartOp), + cast(dmaFinishOp))) { + startWaitPairs.push_back({dmaStartOp, dmaFinishOp}); break; } } @@ -236,8 +236,7 @@ void PipelineDataTransfer::runOnAffineForOp(AffineForOp forOp) { auto mayBeConstTripCount = getConstantTripCount(forOp); if (!mayBeConstTripCount.hasValue()) { - LLVM_DEBUG( - forOp.emitRemark("won't pipeline due to unknown trip count loop")); + LLVM_DEBUG(forOp.emitRemark("won't pipeline due to unknown trip count")); return; } @@ -258,14 +257,14 @@ // the dimension we are adding here for the double buffering is the outermost // dimension. for (auto &pair : startWaitPairs) { - auto *dmaStartInst = pair.first; - Value oldMemRef = dmaStartInst->getOperand( - cast(dmaStartInst).getFasterMemPos()); + auto *dmaStartOp = pair.first; + Value oldMemRef = dmaStartOp->getOperand( + cast(dmaStartOp).getFasterMemPos()); if (!doubleBuffer(oldMemRef, forOp)) { // Normally, double buffering should not fail because we already checked // that there are no uses outside. LLVM_DEBUG(llvm::dbgs() - << "double buffering failed for" << dmaStartInst << "\n";); + << "double buffering failed for" << dmaStartOp << "\n";); // IR still valid and semantically correct. return; } @@ -275,13 +274,13 @@ // order to create the double buffer above.) // '-canonicalize' does this in a more general way, but we'll anyway do the // simple/common case so that the output / test cases looks clear. - if (auto *allocInst = oldMemRef.getDefiningOp()) { + if (auto *allocOp = oldMemRef.getDefiningOp()) { if (oldMemRef.use_empty()) { - allocInst->erase(); + allocOp->erase(); } else if (oldMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldMemRef.user_begin())) { dealloc.erase(); - allocInst->erase(); + allocOp->erase(); } } } @@ -289,22 +288,21 @@ // Double the buffers for tag memrefs. for (auto &pair : startWaitPairs) { - auto *dmaFinishInst = pair.second; - Value oldTagMemRef = - dmaFinishInst->getOperand(getTagMemRefPos(*dmaFinishInst)); + auto *dmaFinishOp = pair.second; + Value oldTagMemRef = dmaFinishOp->getOperand(getTagMemRefPos(*dmaFinishOp)); if (!doubleBuffer(oldTagMemRef, forOp)) { LLVM_DEBUG(llvm::dbgs() << "tag double buffering failed\n";); return; } // If the old tag has no uses or a single dealloc use, remove it. // (canonicalization handles more complex cases). - if (auto *tagAllocInst = oldTagMemRef.getDefiningOp()) { + if (auto *tagAllocOp = oldTagMemRef.getDefiningOp()) { if (oldTagMemRef.use_empty()) { - tagAllocInst->erase(); + tagAllocOp->erase(); } else if (oldTagMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldTagMemRef.user_begin())) { dealloc.erase(); - tagAllocInst->erase(); + tagAllocOp->erase(); } } } @@ -317,12 +315,12 @@ // Store shift for operation for later lookup for AffineApplyOp's. DenseMap instShiftMap; for (auto &pair : startWaitPairs) { - auto *dmaStartInst = pair.first; - assert(isa(dmaStartInst)); - instShiftMap[dmaStartInst] = 0; + auto *dmaStartOp = pair.first; + assert(isa(dmaStartOp)); + instShiftMap[dmaStartOp] = 0; // Set shifts for DMA start op's affine operand computation slices to 0. 
SmallVector sliceOps; - mlir::createAffineComputationSlice(dmaStartInst, &sliceOps); + mlir::createAffineComputationSlice(dmaStartOp, &sliceOps); if (!sliceOps.empty()) { for (auto sliceOp : sliceOps) { instShiftMap[sliceOp.getOperation()] = 0; @@ -331,7 +329,7 @@ // If a slice wasn't created, the reachable affine.apply op's from its // operands are the ones that go with it. SmallVector affineApplyInsts; - SmallVector operands(dmaStartInst->getOperands()); + SmallVector operands(dmaStartOp->getOperands()); getReachableAffineApplyOps(operands, affineApplyInsts); for (auto *op : affineApplyInsts) { instShiftMap[op] = 0; @@ -339,16 +337,14 @@ } } // Everything else (including compute ops and dma finish) are shifted by one. - for (auto &op : *forOp.getBody()) { - if (instShiftMap.find(&op) == instShiftMap.end()) { + for (auto &op : forOp.getBody()->without_terminator()) + if (instShiftMap.find(&op) == instShiftMap.end()) instShiftMap[&op] = 1; - } - } // Get shifts stored in map. std::vector shifts(forOp.getBody()->getOperations().size()); unsigned s = 0; - for (auto &op : *forOp.getBody()) { + for (auto &op : forOp.getBody()->without_terminator()) { assert(instShiftMap.find(&op) != instShiftMap.end()); shifts[s++] = instShiftMap[&op]; @@ -365,7 +361,7 @@ return; } - if (failed(instBodySkew(forOp, shifts))) { + if (failed(affineForOpBodySkew(forOp, shifts))) { LLVM_DEBUG(llvm::dbgs() << "op body skewing failed - unexpected\n";); return; } diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -156,65 +156,57 @@ f.walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); } -/// Generates a 'affine.for' op with the specified lower and upper bounds -/// while generating the right IV remappings for the shifted operations. The -/// operation blocks that go into the loop are specified in instGroupQueue -/// starting from the specified offset, and in that order; the first element of -/// the pair specifies the shift applied to that group of operations; note -/// that the shift is multiplied by the loop step before being applied. Returns -/// nullptr if the generated loop simplifies to a single iteration one. -static AffineForOp -generateLoop(AffineMap lbMap, AffineMap ubMap, - const std::vector>> - &instGroupQueue, - unsigned offset, AffineForOp srcForInst, OpBuilder b) { - auto lbOperands = srcForInst.getLowerBoundOperands(); - auto ubOperands = srcForInst.getUpperBoundOperands(); +/// Generates an affine.for op with the specified lower and upper bounds +/// while generating the right IV remappings to realize shifts for operations in +/// its body. The operations that go into the loop body are specified in +/// opGroupQueue starting from the specified offset, and in that order. The +/// first element of the pair specifies the shift applied to that group of +/// operations; the shift is multiplied by the loop step before being applied. +/// Returns nullptr if the generated loop simplifies to a single iteration one. 
+static AffineForOp generateShiftedLoop( + AffineMap lbMap, AffineMap ubMap, + const std::vector>> &opGroupQueue, + unsigned offset, AffineForOp srcForOp, OpBuilder b) { + auto lbOperands = srcForOp.getLowerBoundOperands(); + auto ubOperands = srcForOp.getUpperBoundOperands(); assert(lbMap.getNumInputs() == lbOperands.size()); assert(ubMap.getNumInputs() == ubOperands.size()); - auto loopChunk = - b.create(srcForInst.getLoc(), lbOperands, lbMap, ubOperands, - ubMap, srcForInst.getStep()); + auto loopChunk = b.create(srcForOp.getLoc(), lbOperands, lbMap, + ubOperands, ubMap, srcForOp.getStep()); auto loopChunkIV = loopChunk.getInductionVar(); - auto srcIV = srcForInst.getInductionVar(); + auto srcIV = srcForOp.getInductionVar(); BlockAndValueMapping operandMap; OpBuilder bodyBuilder = loopChunk.getBodyBuilder(); - for (auto it = instGroupQueue.begin() + offset, e = instGroupQueue.end(); - it != e; ++it) { + for (auto it = opGroupQueue.begin() + offset, e = opGroupQueue.end(); it != e; + ++it) { uint64_t shift = it->first; - auto insts = it->second; + auto ops = it->second; // All 'same shift' operations get added with their operands being // remapped to results of cloned operations, and their IV used remapped. // Generate the remapping if the shift is not zero: remappedIV = newIV - // shift. if (!srcIV.use_empty() && shift != 0) { auto ivRemap = bodyBuilder.create( - srcForInst.getLoc(), + srcForOp.getLoc(), bodyBuilder.getSingleDimShiftAffineMap( - -static_cast(srcForInst.getStep() * shift)), + -static_cast(srcForOp.getStep() * shift)), loopChunkIV); operandMap.map(srcIV, ivRemap); } else { operandMap.map(srcIV, loopChunkIV); } - for (auto *op : insts) { - if (!isa(op)) - bodyBuilder.clone(*op, operandMap); - } + for (auto *op : ops) + bodyBuilder.clone(*op, operandMap); }; if (succeeded(promoteIfSingleIteration(loopChunk))) return AffineForOp(); return loopChunk; } -/// Skew the operations in the body of a 'affine.for' operation with the -/// specified operation-wise shifts. The shifts are with respect to the -/// original execution order, and are multiplied by the loop 'step' before being -/// applied. A shift of zero for each operation will lead to no change. // The skewing of operations with respect to one another can be used for // example to allow overlap of asynchronous operations (such as DMA // communication) with computation, or just relative shifting of operations @@ -226,8 +218,9 @@ // asserts preservation of SSA dominance. A check for that as well as that for // memory-based dependence preservation check rests with the users of this // method. -LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef shifts, - bool unrollPrologueEpilogue) { +LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp, + ArrayRef shifts, + bool unrollPrologueEpilogue) { if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) return success(); @@ -263,11 +256,11 @@ // An array of operation groups sorted by shift amount; each group has all // operations with the same shift in the order in which they appear in the // body of the 'affine.for' op. 
- std::vector> sortedInstGroups(maxShift + 1); + std::vector> sortedOpGroups(maxShift + 1); unsigned pos = 0; - for (auto &op : *forOp.getBody()) { + for (auto &op : forOp.getBody()->without_terminator()) { auto shift = shifts[pos++]; - sortedInstGroups[shift].push_back(&op); + sortedOpGroups[shift].push_back(&op); } // Unless the shifts have a specific pattern (which actually would be the @@ -275,40 +268,39 @@ // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first // loop generated as the prologue and the last as epilogue and unroll these // fully. - AffineForOp prologue; - AffineForOp epilogue; + AffineForOp prologue, epilogue; // Do a sweep over the sorted shifts while storing open groups in a // vector, and generating loop portions as necessary during the sweep. A block // of operations is paired with its shift. - std::vector>> instGroupQueue; + std::vector>> opGroupQueue; auto origLbMap = forOp.getLowerBoundMap(); uint64_t lbShift = 0; OpBuilder b(forOp.getOperation()); - for (uint64_t d = 0, e = sortedInstGroups.size(); d < e; ++d) { + for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) { // If nothing is shifted by d, continue. - if (sortedInstGroups[d].empty()) + if (sortedOpGroups[d].empty()) continue; - if (!instGroupQueue.empty()) { + if (!opGroupQueue.empty()) { assert(d >= 1 && "Queue expected to be empty when the first block is found"); // The interval for which the loop needs to be generated here is: // [lbShift, min(lbShift + tripCount, d)) and the body of the - // loop needs to have all operations in instQueue in that order. + // loop needs to have all operations in opQueue in that order. AffineForOp res; if (lbShift + tripCount * step < d * step) { - res = generateLoop( + res = generateShiftedLoop( b.getShiftedAffineMap(origLbMap, lbShift), b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step), - instGroupQueue, 0, forOp, b); + opGroupQueue, /*offset=*/0, forOp, b); // Entire loop for the queued op groups generated, empty it. - instGroupQueue.clear(); + opGroupQueue.clear(); lbShift += tripCount * step; } else { - res = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift), - b.getShiftedAffineMap(origLbMap, d), instGroupQueue, - 0, forOp, b); + res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), + b.getShiftedAffineMap(origLbMap, d), + opGroupQueue, /*offset=*/0, forOp, b); lbShift = d * step; } if (!prologue && res) @@ -319,16 +311,16 @@ lbShift = d * step; } // Augment the list of operations that get into the current open interval. - instGroupQueue.push_back({d, sortedInstGroups[d]}); + opGroupQueue.push_back({d, sortedOpGroups[d]}); } // Those operations groups left in the queue now need to be processed (FIFO) // and their loops completed. 
- for (unsigned i = 0, e = instGroupQueue.size(); i < e; ++i) { - uint64_t ubShift = (instGroupQueue[i].first + tripCount) * step; - epilogue = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift), - b.getShiftedAffineMap(origLbMap, ubShift), - instGroupQueue, i, forOp, b); + for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) { + uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step; + epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), + b.getShiftedAffineMap(origLbMap, ubShift), + opGroupQueue, /*offset=*/i, forOp, b); lbShift = ubShift; if (!prologue) prologue = epilogue; diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir --- a/mlir/test/Transforms/pipeline-data-transfer.mlir +++ b/mlir/test/Transforms/pipeline-data-transfer.mlir @@ -36,23 +36,23 @@ // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> // CHECK-NEXT: affine.for %{{.*}} = 1 to 8 { // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.apply [[MAP_MINUS_1]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MAP_MINUS_1]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> -// CHECK-NEXT: %{{.*}} = "compute"(%{{.*}}) : (f32) -> f32 +// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> +// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> () // CHECK-NEXT: } // CHECK-NEXT: } -// CHECK-NEXT: %{{.*}} = affine.apply [[MAP_MINUS_1]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MAP_MINUS_1]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[MOD_2]](%{{.*}}) // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32> -// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> -// CHECK-NEXT: %{{.*}} = "compute"(%{{.*}}) : (f32) -> f32 +// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> +// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> () @@ -89,8 +89,8 @@ // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32> // CHECK-NEXT: affine.for %{{.*}} = 4 to 512 step 4 { // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, 
memref<2x1xi32> -// CHECK-NEXT: %{{.*}} = affine.apply [[REMAP_SHIFT_MINUS_4]](%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply [[FLOOR_MOD_2]](%{{.*}}) +// CHECK-NEXT: affine.apply [[REMAP_SHIFT_MINUS_4]](%{{.*}}) +// CHECK-NEXT: affine.apply [[FLOOR_MOD_2]](%{{.*}}) // CHECK: affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32> // CHECK-NEXT: "compute"(%{{.*}}) : (index) -> () // CHECK-NEXT: } @@ -313,7 +313,7 @@ dealloc %tag : memref<1 x i32> dealloc %Av : memref<32 x 32 x f32, 2> return %v : f32 -// CHECK: %{{[0-9]+}} = affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2> +// CHECK: affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2> // CHECK: return } @@ -329,10 +329,10 @@ %tag = alloc() : memref<1 x i32> // Double buffering for dynamic shaped buffer. -// CHECK: %{{.*}} = alloc(%{{.*}}, %{{.*}}) : memref -// CHECK-NEXT: %{{.*}} = dim %{{.*}}, 0 : memref -// CHECK-NEXT: %{{.*}} = dim %{{.*}}, 1 : memref -// CHECK-NEXT: %{{.*}} = alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2> +// CHECK: alloc(%{{.*}}, %{{.*}}) : memref +// CHECK-NEXT: dim %{{.*}}, 0 : memref +// CHECK-NEXT: dim %{{.*}}, 1 : memref +// CHECK-NEXT: alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2> // CHECK: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} affine.for %kTT = 0 to 16 { affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
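For reference, below is a minimal sketch (not part of the patch) of how a client pass could drive the renamed affineForOpBodySkew utility to overlap DMA starts with computation, mirroring the shift assignment done in PipelineDataTransfer::runOnAffineForOp above. The affine-ops header path and the AffineDmaStartOp spelling are assumptions for this revision (the quoted patch has its template arguments stripped); the rest follows the declarations shown in the patch.

// Sketch: give DMA start ops shift 0 and everything else in the body shift 1,
// then skew the loop body so the next iteration's DMA overlaps this
// iteration's compute. Header paths below are assumed for this revision.
#include "mlir/Dialect/AffineOps/AffineOps.h" // assumed location of AffineForOp / AffineDmaStartOp
#include "mlir/Transforms/LoopUtils.h"        // declares affineForOpBodySkew after this patch
#include "llvm/ADT/DenseMap.h"
#include <vector>

using namespace mlir;

static void skewDmaStartsByOneIteration(AffineForOp forOp) {
  // Record a shift per body op (terminator excluded), as the pass does.
  llvm::DenseMap<Operation *, uint64_t> shiftMap;
  for (Operation &op : forOp.getBody()->without_terminator())
    shiftMap[&op] = isa<AffineDmaStartOp>(op) ? 0 : 1;

  // Flatten into the positional shift vector, sized the same way as in the
  // pass (one slot per body operation).
  std::vector<uint64_t> shifts(forOp.getBody()->getOperations().size());
  unsigned s = 0;
  for (Operation &op : forOp.getBody()->without_terminator())
    shifts[s++] = shiftMap[&op];

  // Skew the body; prologue/epilogue unrolling is left at its default
  // (false). On failure the IR is left valid and unchanged, matching the
  // pass's behavior.
  if (failed(affineForOpBodySkew(forOp, shifts)))
    return;
}

In the pass itself the shift-0 set is slightly larger (it also covers the affine.apply slices feeding each DMA start), but the call sequence into affineForOpBodySkew is the same.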