diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -230,6 +230,10 @@ return tensor(b) == outTensor && index(b) == i; } + unsigned getOutTensorID() const { return outTensor; } + + unsigned getSynTensorID() const { return outTensor + 1; } + /// Returns true if given tensor iterates *only* in the given tensor /// expression. For the output tensor, this defines a "simply dynamic" /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -34,127 +34,6 @@ /// `createFuncCall()`, and `replaceOpWithFuncCall()`. enum class EmitCInterface : bool { Off = false, On = true }; -//===----------------------------------------------------------------------===// -// SparseTensorLoopEmiter class, manages sparse tensors and helps to generate -// loop structure to (co-iterate) sparse tensors. -// -// An example usage: -// To generate following loops over T1 and T2 -// -// for i in T1[0] { -// for j : T2[0] { -// for k : T1[1] {} -// for k : T2[1] {} -// } -// } -// -// One can use -// -// SparseTensorLoopEmiter loopEmiter({T1, T1}); -// loopEmiter.initializeLoopEmit(); -// loopEmiter.enterLoopOverTensorAtDim(T1, 0); -// loopEmiter.enterLoopOverTensorAtDim(T2, 0); -// loopEmiter.enterLoopOverTensorAtDim(T1, 1); -// loopEmiter.exitCurrentLoop(); -// loopEmiter.enterLoopOverTensorAtDim(T2, 1); -// for 0 -> 3: -// loopEmiter.exitCurrentLoop(); -//===----------------------------------------------------------------------===// - -// TODO: Sparsification should also rely on this class to generate loops. -class SparseTensorLoopEmitter { -public: - /// Constructor: take an array of tensors inputs, on which the generated loops - /// will iterate on. The index of the tensor in the array is also the - /// tensor id (tid) used in related functions. - explicit SparseTensorLoopEmitter(ValueRange tensors, - bool isLastOutput = false); - - /// - /// Core functions. - /// - - /// Starts a loop emitting session: - /// 1. Generates all the buffers needed to iterate tensors. - /// 2. Generates the lo/hi bounds to iterate tensors[0]. - void initializeLoopEmit(OpBuilder &builder, Location loc); - - // TODO: Gets rid of `dim` in the argument list? Track the dimension we - // are currently at internally. Then it would be enterNextDimForTensor. - - /// Emits loop over tensor[dim], it assumes that loops between - /// tensor[0...dim - 1] have already been generated. - /// It also prepares to enter tensor[dim + 1]. - Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc, - size_t tid, size_t dim, - ArrayRef reduc = {}); - - /// Emits a coiteration loop over a set of tensors. - // TODO: not yet implemented - void enterCoiterationOverTensorsAtDims(OpBuilder &builder, Location loc, - ArrayRef ts, - ArrayRef ds); - - /// Emits extra locals, since the locals might not be in simplified lattices - /// point used to generate the loops, but are still required to generates - /// expressions. 
- Value emitExtraLocalsForTensorsAtDims(OpBuilder &builder, Location loc, - size_t tid, size_t dim); - - void exitCurrentLoop(); - - /// Return the array of coordinate for all the loop generated till now. - void getCoordinateArray(SmallVectorImpl &coords) { - for (auto &l : loopStack) - coords.push_back(l.idx); - } - - /// - /// Getters. - /// - - Value getTensorValueBuffer(size_t tid) { return valBuffer[tid]; } - Value getLastLevelTensorPointerIndex(size_t tid) { - return pidxs[tid].back(); - }; - -private: - struct LoopLevelInfo { - LoopLevelInfo(ArrayRef ts, ArrayRef ds, Value idx) - : tensors(ts), dims(ds), idx(idx) {} - llvm::SmallVector tensors; - llvm::SmallVector dims; - Value idx; - }; - - /// Return false if tid[dim] is a dense dimension that does not need to be - /// prepared (to be used by sparsification for needUniv). - bool prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, - size_t dim); - - /// Input (TODO: and output) tensors. - std::vector tensors; - /// The dim type array for each tensor. - std::vector> dims; - /// Sparse iteration information (by tensor and dim). These arrays - /// are updated to remain current within the current loop. - std::vector> pidxs; - std::vector> coord; - std::vector> highs; - /// Universal dense indices and upper bounds (by index). The sizes array is - /// set once with the inferred dimension sizes. - std::vector> sizes; - std::vector> ptrBuffer; // to_pointers - std::vector> idxBuffer; // to_indices - std::vector valBuffer; // to_value - - bool isLastOutput; // Is the last tensor output tensor - std::vector loopStack; - // TODO: not yet used, it should track the current level for each tensor - // to help eliminate `dim` paramters from above APIs. - std::vector curLv; -}; - //===----------------------------------------------------------------------===// // ExecutionEngine/SparseTensorUtils helper functions. //===----------------------------------------------------------------------===// @@ -400,6 +279,234 @@ return constantI8(builder, loc, static_cast(dlt)); } +/// Computes the shape of destination tensor of a reshape operator. This is only +/// used when operands have dynamic shape. The shape of the destination is +/// stored into dstShape. +void genReshapeDstShape(Location loc, PatternRewriter &rewriter, + SmallVector &dstShape, + ArrayRef srcShape, + ArrayRef staticDstShape, + ArrayRef reassociation); + +/// Helper method to translate indices during a reshaping operation. +void translateIndicesArray(OpBuilder &builder, Location loc, + ArrayRef reassociation, + ValueRange srcIndices, ArrayRef srcShape, + ArrayRef dstShape, + SmallVectorImpl &dstIndices); + +inline bool isZeroRankedTensorOrScalar(Type type) { + auto rtp = type.dyn_cast(); + return !rtp || rtp.getRank() == 0; +} + +//===----------------------------------------------------------------------===// +// SparseTensorLoopEmiter class, manages sparse tensors and helps to generate +// loop structure to (co-iterate) sparse tensors. 
+// +// An example usage: +// To generate following loops over T1 and T2 +// +// for i in T1[0] { +// for j : T2[0] { +// for k : T1[1] {} +// for k : T2[1] {} +// } +// } +// +// One can use +// +// SparseTensorLoopEmiter loopEmiter({T1, T1}); +// loopEmiter.initializeLoopEmit(); +// loopEmiter.enterLoopOverTensorAtDim(T1, 0); +// loopEmiter.enterLoopOverTensorAtDim(T2, 0); +// loopEmiter.enterLoopOverTensorAtDim(T1, 1); +// loopEmiter.exitCurrentLoop(); +// loopEmiter.enterLoopOverTensorAtDim(T2, 1); +// for 0 -> 3: +// loopEmiter.exitCurrentLoop(); +//===----------------------------------------------------------------------===// + +// TODO: Sparsification should also rely on this class to generate loops. +class SparseTensorLoopEmitter { +public: + /// Optional callback function to setup dense output tensors when initialize + /// the loop emitter (e.g.,to fill a dense output with zeros). + using OutputUpdater = function_ref; + + /// Constructor: take an array of tensors inputs, on which the generated loops + /// will iterate on. The index of the tensor in the array is also the + /// tensor id (tid) used in related functions. + /// If isSparseOut is set, loop emitter assume that the sparse output tensor + /// is empty, and will always generate loops on it based on the dim sizes. + explicit SparseTensorLoopEmitter(ValueRange tensors, + bool isLastOutput = false, + bool isSparseOut = false); + + /// + /// Core functions. + /// + + /// Starts a loop emitting session by generating all the buffers needed to + /// iterate tensors. + void initializeLoopEmit(OpBuilder &builder, Location loc, + OutputUpdater updater = nullptr); + + /// Enters a new loop sequence, the loops within the same sequence starts from + /// the break points of previous loop instead of starting over from 0. + /// e.g., + /// { + /// // loop sequence start. + /// p0 = while(xxx) + /// ... + /// break p0 + /// + /// for (p0 -> end) // see how the loop starts from p0 instead of 0. + /// ... + /// // loop sequence end. + /// } + void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims); + + void exitCurrentLoopSeq() { + assert(loopSeqStack.size() == loopStack.size() + 1); + loopSeqStack.pop_back(); + } + + // TODO: Gets rid of `dim` in the argument list? Track the dimension we + // are currently at internally. Then it would be enterNextDimForTensor. + // Still need a way to specify the dim for non annoated dense tensor though, + // as it can be accessed out of order. + /// Emits loop over tensor[dim], it assumes that loops between + /// tensor[0...dim - 1] have already been generated. + /// It also prepares to enter tensor[dim + 1]. + /// The function will also perform in-place update on the `reduc` vector to + /// return the reduction variable used inside the generated loop. + Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + MutableArrayRef reduc = {}, + bool isParallel = false, + ArrayRef extraTids = {}, + ArrayRef extraDims = {}); + + /// Emits a coiteration loop over a set of tensors. + Operation *enterCoiterationOverTensorsAtDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims, bool needsUniv, MutableArrayRef reduc = {}, + ArrayRef extraTids = {}, ArrayRef extraDims = {}); + + SmallVector exitCurrentLoop(OpBuilder &builder, Location loc, + ArrayRef reduc = {}); + + /// Return the array of coordinate for all the loop generated till now. 
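[reviewer note] For readers unfamiliar with the nesting discipline documented above, here is a small standalone C++ model of it (not MLIR builder code; all names are hypothetical): a loop sequence is entered before its loops and exited after them, so loopSeqStack stays exactly one entry deeper than loopStack while a sequence is open, which is the invariant asserted in exitCurrentLoopSeq.

```cpp
// Standalone model of the emitter's nesting discipline (hypothetical names).
#include <cassert>
#include <vector>

struct LoopNestModel {
  std::vector<int> loopStack;    // one entry per live loop (real class stores LoopLevelInfo)
  std::vector<int> loopSeqStack; // one entry per live loop sequence (universal index)

  void enterNewLoopSeq() {
    assert(loopSeqStack.size() == loopStack.size());
    loopSeqStack.push_back(0); // universal index starts from 0
  }
  void enterLoop() {
    assert(loopSeqStack.size() == loopStack.size() + 1);
    loopStack.push_back(0); // stand-in for the induction variable
  }
  void exitCurrentLoop() { loopStack.pop_back(); }
  void exitCurrentLoopSeq() {
    assert(loopSeqStack.size() == loopStack.size() + 1);
    loopSeqStack.pop_back();
  }
};

int main() {
  // Same shape as the foreach rewriting later in this patch: one loop
  // sequence plus one loop per tensor dimension, unwound in LIFO order.
  LoopNestModel m;
  for (int d = 0; d < 2; d++) {
    m.enterNewLoopSeq();
    m.enterLoop();
  }
  for (int d = 0; d < 2; d++) {
    m.exitCurrentLoop();
    m.exitCurrentLoopSeq();
  }
  return 0;
}
```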
+ void getCoordinateArray(SmallVectorImpl &coords) const { + for (auto &l : loopStack) + coords.push_back(l.iv); + } + + /// Get loop induction variable at the given level. + Value getLoopIV(size_t level) const { + if (level < loopStack.size()) + return loopStack[level].iv; + return nullptr; + } + + /// + /// Getters. + /// + const std::vector> &getPidxs() const { return pidxs; }; + const std::vector> &getCoord() const { return coord; }; + const std::vector> &getHighs() const { return highs; }; + const std::vector> &getPtrBuffer() const { + return ptrBuffer; + }; + const std::vector> &getIdxBuffer() const { + return idxBuffer; + }; + const std::vector &getValBuffer() const { return valBuffer; }; + +private: + struct LoopLevelInfo { + LoopLevelInfo(ArrayRef tids, ArrayRef dims, Operation *loop, + Value iv) + : tids(tids), dims(dims), loop(loop), iv(iv) {} + // TODO: maybe use a vector for tid and dim? + // The set of tensors that the loop is operating on + const llvm::SmallVector tids; + // The corresponding dims for the tensors + const llvm::SmallVector dims; + const Operation *loop; // the loop operation + const Value iv; // the induction variable for the loop + }; + + /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). + Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim, + Value iv) { + Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; + Value mul = builder.create(loc, highs[tid][dim], p); + Value add = builder.create(loc, mul, iv); + return add; + } + + bool isOutputTensor(size_t tid) { + return isLastOutput && tid == tensors.size() - 1; + } + + /// Setups [lo, hi] for iterating tensor[dim], it assumes that tensor[0 + /// ...dims-1] has already been setup. + void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, + size_t dim); + + /// Emits extra locals, since the locals might not be in simplified lattices + /// point used to generate the loops, but are still required to generates + /// expressions. + void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims); + + /// Exits a for loop, returns the reduction results, e.g., + /// %ret = for () { + /// ... + /// yield %val + /// } + /// Return %ret to user, while %val is provided by users (`reduc`) + SmallVector exitForLoop(OpBuilder &builder, Location loc, + ArrayRef reduc); + + /// Exits a while loop, returns the reduction results. + SmallVector exitCoiterationLoop(OpBuilder &builder, Location loc, + ArrayRef reduc); + + // Is the last tensor output tensor + bool isLastOutput; + /// Input and (optional) output tensors. + std::vector tensors; + /// The dim type array for each tensor. + std::vector> dimTypes; + /// Sparse iteration information (by tensor and dim). These arrays + /// are updated to remain current within the current loop. + std::vector> pidxs; + std::vector> coord; + std::vector> highs; + /// Universal dense indices and upper bounds (by index). The sizes array is + /// set once with the inferred dimension sizes. + std::vector> ptrBuffer; // to_pointers + std::vector> idxBuffer; // to_indices + std::vector valBuffer; // to_value + + // Loop Stack, stores the information of all the nested loops that are alive. + std::vector loopStack; + + // Loop Sequence Stack, stores the unversial index for the current loop + // sequence. + std::vector loopSeqStack; + + // TODO: not yet used, it should track the current level for each tensor + // to help eliminate `dim` paramters from above APIs. 
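[reviewer note] As a sanity check on genAddress above, this is a plain C++ sketch (not the emitted IR) of the recurrence it builds for dense levels, pidx(d) = pidx(d-1) * size(d) + iv, i.e. row-major linearization such as p = i * d0 + j; the helper name linearize is for illustration only.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t linearize(const std::vector<int64_t> &sizes,
                  const std::vector<int64_t> &coords) {
  int64_t pidx = 0;
  for (std::size_t d = 0; d < sizes.size(); d++)
    pidx = pidx * sizes[d] + coords[d]; // same recurrence as genAddress
  return pidx;
}

int main() {
  // For a 3x4 tensor, element (2, 1) lives at offset 2 * 4 + 1 = 9.
  assert(linearize({3, 4}, {2, 1}) == 9);
  return 0;
}
```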
+ // std::vector curLv; +}; + } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp @@ -95,45 +95,49 @@ //===----------------------------------------------------------------------===// SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, - bool isLastOutput) - : tensors(tensors.begin(), tensors.end()), dims(tensors.size()), - pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()), - sizes(tensors.size()), ptrBuffer(tensors.size()), - idxBuffer(tensors.size()), valBuffer(tensors.size()), - isLastOutput(isLastOutput), loopStack(), curLv(tensors.size(), 0) { - for (size_t i = 0, e = tensors.size(); i < e; i++) { - auto t = tensors[i]; - auto rtp = t.getType().dyn_cast(); - if (!rtp) // a scalar (0-dimension tensors) + bool isLastOutput, + bool isSparseOut) + : isLastOutput(isLastOutput), tensors(tensors.begin(), tensors.end()), + dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()), + highs(tensors.size()), ptrBuffer(tensors.size()), + idxBuffer(tensors.size()), valBuffer(tensors.size()), loopStack() { + for (size_t tid = 0, e = tensors.size(); tid < e; tid++) { + auto t = tensors[tid]; + // a scalar or 0-dimension tensors + if (isZeroRankedTensorOrScalar(t.getType())) continue; - + auto rtp = t.getType().cast(); auto rank = static_cast(rtp.getRank()); auto enc = getSparseTensorEncoding(rtp); - if (enc) + // We always treat sparse output tensor as dense so that we always iterate + // it based on dim size. + if (enc && !(isOutputTensor(tid) && isSparseOut)) for (auto dimTp : enc.getDimLevelType()) - dims[i].push_back(dimTp); + dimTypes[tid].push_back(dimTp); else - dims[i].assign(rank, DimLevelType::Dense); + dimTypes[tid].assign(rank, DimLevelType::Dense); // Initialize using empty value. - pidxs[i].assign(rank, Value()); - coord[i].assign(rank, Value()); - highs[i].assign(rank, Value()); - sizes[i].assign(rank, Value()); - ptrBuffer[i].assign(rank, Value()); - idxBuffer[i].assign(rank, Value()); + pidxs[tid].assign(rank, Value()); + coord[tid].assign(rank, Value()); + highs[tid].assign(rank, Value()); + ptrBuffer[tid].assign(rank, Value()); + idxBuffer[tid].assign(rank, Value()); } } -void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder, - Location loc) { +void SparseTensorLoopEmitter::initializeLoopEmit( + OpBuilder &builder, Location loc, + SparseTensorLoopEmitter::OutputUpdater updater) { // For every tensor, find lower and upper bound on dimensions, set the // same bounds on loop indices, and obtain dense or sparse buffer(s). - // TODO: Provides ability to generate loop on output buffer (with undef - // dim level in Merger in GenericOp Sparsification). for (size_t t = 0, e = tensors.size(); t < e; t++) { auto tensor = tensors[t]; - auto rtp = tensor.getType().cast(); + auto rtp = tensor.getType().dyn_cast(); + if (!rtp) + // Skips only scalar, zero ranked tensor still need to be bufferized and + // (probably) filled with zeros by users. + continue; auto rank = rtp.getRank(); auto shape = rtp.getShape(); auto enc = getSparseTensorEncoding(rtp); @@ -141,10 +145,9 @@ // Scan all dimensions of current tensor. for (int64_t d = 0; d < rank; d++) { // This should be called only once at beginning. 
- assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !sizes[t][d] && - !highs[t][d]); + assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]); // Handle sparse storage schemes. - if (isCompressedDLT(dims[t][d])) { + if (isCompressedDLT(dimTypes[t][d])) { auto ptrTp = MemRefType::get(dynShape, getPointerOverheadType(builder, enc)); auto indTp = @@ -153,7 +156,7 @@ // Generate sparse primitives to obtains pointer and indices. ptrBuffer[t][d] = builder.create(loc, ptrTp, tensor, dim); idxBuffer[t][d] = builder.create(loc, indTp, tensor, dim); - } else if (isSingletonDLT(dims[t][d])) { + } else if (isSingletonDLT(dimTypes[t][d])) { // Singleton dimension, fetch indices. auto indTp = MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); @@ -161,105 +164,225 @@ idxBuffer[t][d] = builder.create(loc, indTp, tensor, dim); } else { // Dense dimension, nothing to fetch. - assert(isDenseDLT(dims[t][d])); + assert(isDenseDLT(dimTypes[t][d])); } // Find upper bound in current dimension. unsigned p = toOrigDim(enc, d); Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p); - sizes[t][d] = highs[t][d] = up; + highs[t][d] = up; } + // Perform the required bufferization. Dense inputs materialize - // from the input tensors. Dense outputs need special handling. - // Sparse inputs use sparse primitives to obtain the values. + // from the input tensors. Sparse inputs use sparse primitives to obtain the + // values. + // Delegates extra output initialization to clients. + bool isOutput = isOutputTensor(t); Type elementType = rtp.getElementType(); - if (!enc) { // Non-annotated dense tensors. auto denseTp = MemRefType::get(shape, elementType); - if (isLastOutput && t == tensors.size() - 1) - llvm_unreachable("TODO: not yet handled"); - else - valBuffer[t] = - builder.create(loc, denseTp, tensor); + Value denseVal = + builder.create(loc, denseTp, tensor); + // Dense outputs need special handling. + if (isOutput && updater) + denseVal = updater(builder, loc, denseVal, tensor); + + valBuffer[t] = denseVal; } else { // Annotated sparse tensors. + // We also need the value buffer for annotated all dense `sparse` tensor. auto dynShape = {ShapedType::kDynamicSize}; auto sparseTp = MemRefType::get(dynShape, elementType); valBuffer[t] = builder.create(loc, sparseTp, tensor); } - // Prepare to enter the first dim for all (input) tensors - prepareLoopOverTensorAtDim(builder, loc, t, 0); + // NOTE: we can also prepares for 0 dim here in advance, this will hosit + // some loop preparation from tensor iteration, but will also (undesirably) + // hosit the code ouside if conditions. } } +void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims) { + // Universal Index start from 0 + assert(loopSeqStack.size() == loopStack.size()); + // Universal index starts from 0 + loopSeqStack.emplace_back(constantIndex(builder, loc, 0)); + // Prepares for all the tensors used in the current loop sequence. + for (auto [tid, dim] : llvm::zip(tids, dims)) + prepareLoopOverTensorAtDim(builder, loc, tid, dim); +} + Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim( OpBuilder &builder, Location loc, size_t tid, size_t dim, - ArrayRef reduc) { - assert(dims[tid].size() > dim); + MutableArrayRef reduc, bool isParallel, ArrayRef extraTids, + ArrayRef extraDims) { + assert(dimTypes[tid].size() > dim); // We can not re-enter the same level. 
assert(!coord[tid][dim]); + Value step = constantIndex(builder, loc, 1); - auto dimType = dims[tid][dim]; - bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); + auto dimType = dimTypes[tid][dim]; + bool isSparseInput = isCompressedDLT(dimType) || isSingletonDLT(dimType); assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || isSingletonDLT(dimType)); - Value lo = isSparse ? pidxs[tid][dim] : constantIndex(builder, loc, 0); + Value lo = isSparseInput ? pidxs[tid][dim] // current offset + : loopSeqStack.back(); // univeral tid Value hi = highs[tid][dim]; - // TODO: support reduction. - if (!reduc.empty()) - llvm_unreachable("TODO: not implemented yet"); - scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); builder.setInsertionPointToStart(forOp.getBody()); Value iv = forOp.getInductionVar(); - Operation *loop = forOp; - assert(iv); - if (isSparse) { + if (isSparseInput) { pidxs[tid][dim] = iv; // Generating a load on the indices array yields the coordinate. Value ptr = idxBuffer[tid][dim]; - // TODO: generates load for vector value. coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); } else { // Dense tensor, the coordinates is the inducation variable. coord[tid][dim] = iv; // generate pidx for dense dim (pidx = i * sz + j) - // TODO: handle vector loop. - Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; - Value mul = builder.create(loc, sizes[tid][dim], p); - Value add = builder.create(loc, mul, iv); - pidxs[tid][dim] = add; + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, iv); } - // Prepares for next dim if this is not currently the innermost dimension. - if (dim != dims[tid].size() - 1) - prepareLoopOverTensorAtDim(builder, loc, tid, dim + 1); + // NOTE: we can also prepares for next dim here in advance + // Push the loop into stack + loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp, + coord[tid][dim]); + // Emit extra locals. + emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims); + + // In-place update on the reduction variable vector. + assert(forOp.getNumRegionIterArgs() == reduc.size()); + for (int i = 0, e = reduc.size(); i < e; i++) + reduc[i] = forOp.getRegionIterArg(i); + return forOp; +} + +Operation *SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims, bool needsUniv, MutableArrayRef reduc, + ArrayRef extraTids, ArrayRef extraDims) { + assert(tids.size() == dims.size()); + SmallVector types; + SmallVector operands; + // Construct the while-loop with a parameter for each index. + Type indexType = builder.getIndexType(); + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + assert(pidxs[tid][dim]); + types.push_back(indexType); + operands.push_back(pidxs[tid][dim]); + } + } + // The position where user-supplied reduction variable starts. + for (Value rec : reduc) { + types.push_back(rec.getType()); + operands.push_back(rec); + } + if (needsUniv) { + types.push_back(indexType); + // Update universal index. 
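[reviewer note] To make the pointer/index bookkeeping concrete, here is a plain C++ walk of a tiny CSR matrix (made-up data) mirroring the loops these helpers emit for a (dense, compressed) tensor: the compressed level's bounds come from the pointer buffer (see prepareLoopOverTensorAtDim further down), and its coordinate is loaded from the index buffer at the current position index.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // 3x4 CSR matrix with nonzeros (0,1)=1, (1,0)=2, (1,3)=3, (2,2)=4.
  std::vector<int> ptr = {0, 1, 3, 4};    // to_pointers of the compressed level
  std::vector<int> idx = {1, 0, 3, 2};    // to_indices of the compressed level
  std::vector<double> val = {1, 2, 3, 4}; // to_value
  for (int i = 0; i < 3; i++) {           // dense level: iterate 0 .. high
    int pLo = ptr[i], pHi = ptr[i + 1];   // bounds from the pointer buffer
    for (int p = pLo; p < pHi; p++) {     // compressed level: iterate pLo .. pHi
      int j = idx[p];                     // coordinate = indices[pidx]
      std::printf("A[%d][%d] = %g\n", i, j, val[p]);
    }
  }
  return 0;
}
```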
+ operands.push_back(loopSeqStack.back()); + } + assert(types.size() == operands.size()); + scf::WhileOp whileOp = builder.create(loc, types, operands); + + SmallVector locs(types.size(), loc); + Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); + Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); + + // Build the "before" region, which effectively consists + // of a conjunction of "i < upper" tests on all induction. + builder.setInsertionPointToStart(&whileOp.getBefore().front()); + Value cond; + unsigned o = 0; + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value op1 = before->getArgument(o); + Value op2 = highs[tid][dim]; + Value opc = builder.create(loc, arith::CmpIPredicate::ult, + op1, op2); + cond = cond ? builder.create(loc, cond, opc) : opc; + // Update + pidxs[tid][dim] = after->getArgument(o++); + } + } + builder.create(loc, cond, before->getArguments()); + + // Generates while body. + builder.setInsertionPointToStart(&whileOp.getAfter().front()); + Value min; + for (auto [tid, dim] : llvm::zip(tids, dims)) { + // Prepares for next level. + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value ptr = idxBuffer[tid][dim]; + Value s = pidxs[tid][dim]; + Value load = genIndexLoad(builder, loc, ptr, s); + coord[tid][dim] = load; + if (!needsUniv) { + if (min) { + Value cmp = builder.create( + loc, arith::CmpIPredicate::ult, load, min); + min = builder.create(loc, cmp, load, min); + } else { + min = load; + } + } + } + } - loopStack.push_back(LoopLevelInfo({tid}, {dim}, coord[tid][dim])); - return loop; -} + if (needsUniv) { + assert(!min); + // Otherwise, universal index is the minimal pidx. + min = after->getArguments().back(); + } + + for (auto [tid, dim] : llvm::zip(tids, dims)) { + // All dense dim (as well as sparse output tensor) shared the same pidx in + // the while loop. + if (isDenseDLT(dimTypes[tid][dim])) { + pidxs[tid][dim] = min; + // generate pidx for dense dim (pidx = i * sz + j) + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, min); + } + // NOTE: we can also prepares for next dim here in advance + } + // Sets up the loop stack. + loopStack.emplace_back(tids, dims, whileOp, min); + assert(loopStack.size() == loopSeqStack.size()); -void SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims( - OpBuilder &builder, Location loc, ArrayRef ts, - ArrayRef ds) { - llvm_unreachable("TODO: unimplemented"); + // Emits extra locals + emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims); + + // Updates reduction variables + assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0)); + // In-place update on reduction variable. + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = after->getArgument(o + i); + + return whileOp; } -bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, +void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, size_t dim) { - // TODO: generate loop iteration on output tensor based on the shape - // instead of pointer/indices arrays. 
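[reviewer note] The while-loop built here is the standard sparse co-iteration pattern. A minimal standalone analogue (plain C++, made-up data) of what it computes: walk several sorted index arrays in lock-step, visit the minimum coordinate each round, and advance only the iterators whose coordinate equals that minimum (the advancing step is what exitCoiterationLoop finalizes below).

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Nonzero coordinates of two sparse vectors (their "indices" arrays).
  std::vector<int> a = {0, 3, 5, 9};
  std::vector<int> b = {1, 3, 9};
  std::size_t pa = 0, pb = 0;              // the pidxs carried by the while-loop
  while (pa < a.size() && pb < b.size()) { // "before" region: conjunction of bound tests
    int ia = a[pa], ib = b[pb];
    int m = std::min(ia, ib);              // coordinate visited this iteration
    std::printf("visiting coordinate %d\n", m);
    pa += (ia == m);                       // induction: advance the matching iterators
    pb += (ib == m);
  }
  return 0;
}
```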
- assert(dims[tid].size() > dim); - auto dimType = dims[tid][dim]; + assert(dimTypes[tid].size() > dim); + auto dimType = dimTypes[tid][dim]; if (isDenseDLT(dimType)) - return false; + return; // Either the first dimension, or the previous dimension has been set. assert(dim == 0 || pidxs[tid][dim - 1]); @@ -269,11 +392,11 @@ Value ptr = ptrBuffer[tid][dim]; Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1]; - Value pHi = builder.create(loc, pLo, c1); - pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo); + + Value pHi = builder.create(loc, pLo, c1); highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi); - return true; + return; } if (isSingletonDLT(dimType)) { Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1]; @@ -281,34 +404,139 @@ pidxs[tid][dim] = pLo; highs[tid][dim] = pHi; - return true; + return; } llvm_unreachable("Unrecognizable dimesion type!"); } -Value SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDims( - OpBuilder &builder, Location loc, size_t tid, size_t dim) { - llvm_unreachable("TODO: not implemented yet"); +// FIXME: Make this call private +void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims) { + // Initialize dense positions. Note that we generate dense indices of the + // output tensor unconditionally, since they may not appear in the lattice, + // but may be needed for linearized codegen. + for (auto [tid, dim] : llvm::zip(tids, dims)) { + assert(isDenseDLT(dimTypes[tid][dim])); + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) { + bool validPidx = dim == 0 || pidxs[tid][dim - 1]; + if (!validPidx) { + // We might not find the pidx for the sparse output tensor as it is + // unconditionally required by the sparsification. + assert(isOutputTensor(tid)); + continue; + } + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, loopStack.back().iv); + // NOTE: we can also prepares for next dim here in advance + } + } } -void SparseTensorLoopEmitter::exitCurrentLoop() { - // Clean up the values, it would help use to discover potential bug at a - // earlier stage (instead of silently using a wrong value). +SmallVector +SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { LoopLevelInfo &loopInfo = loopStack.back(); - assert(loopInfo.tensors.size() == loopInfo.dims.size()); - for (auto info : llvm::zip(loopInfo.tensors, loopInfo.dims)) { - auto tid = std::get<0>(info); - auto dim = std::get<1>(info); - assert(pidxs[tid][dim] && coord[tid][dim] && highs[tid][dim]); + auto &dims = loopStack.back().dims; + auto &tids = loopStack.back().tids; + auto forOp = llvm::cast(loopInfo.loop); + if (!reduc.empty()) { + assert(reduc.size() == forOp.getNumResults()); + builder.setInsertionPointToEnd(forOp.getBody()); + builder.create(loc, reduc); + } + + // Finished iterating a tensor, clean up + // We only do the clean up on for loop as while loops do not necessarily + // finish the iteration on a sparse tensor + for (auto [tid, dim] : llvm::zip(tids, dims)) { // Reset to null. - pidxs[tid][dim] = Value(); coord[tid][dim] = Value(); - if (!isDenseDLT(dims[tid][dim])) - // Dense dimension, high is fixed. + pidxs[tid][dim] = Value(); + // Dense dimension, high is fixed. 
+ if (!isDenseDLT(dimTypes[tid][dim])) highs[tid][dim] = Value(); } + // exit the loop + builder.setInsertionPointAfter(forOp); + return forOp.getResults(); +} + +SmallVector +SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { + auto whileOp = llvm::cast(loopStack.back().loop); + auto &dims = loopStack.back().dims; + auto &tids = loopStack.back().tids; + Value iv = loopStack.back().iv; + // Generation while loop induction at the end. + builder.setInsertionPointToEnd(&whileOp.getAfter().front()); + // Finalize the induction. Note that the induction could be performed + // in the individual if-branches to avoid re-evaluating the conditions. + // However, that would result in a rather elaborate forest of yield + // instructions during code generation. Moreover, performing the induction + // after the if-statements more closely resembles code generated by TACO. + unsigned o = 0; + SmallVector operands; + Value one = constantIndex(builder, loc, 1); + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value op1 = coord[tid][dim]; + Value op3 = pidxs[tid][dim]; + Value cmp = + builder.create(loc, arith::CmpIPredicate::eq, op1, iv); + Value add = builder.create(loc, op3, one); + operands.push_back(builder.create(loc, cmp, add, op3)); + // Following loops continue iteration from the break point of the + // current while loop. + pidxs[tid][dim] = whileOp->getResult(o++); + // The coordinates are invalid now. + coord[tid][dim] = nullptr; + // highs remains unchanged. + } + } + + // Reduction value from users. + SmallVector ret; + for (auto red : reduc) { + operands.push_back(red); + ret.push_back(whileOp->getResult(o++)); + } + + // An (optional) universal index. + if (operands.size() < whileOp.getNumResults()) { + assert(operands.size() + 1 == whileOp.getNumResults()); + // The last one is the universial index. + operands.push_back(builder.create(loc, iv, one)); + // update the loop starting point of current loop sequence + loopSeqStack.back() = whileOp->getResult(o++); + } + + assert(o == operands.size()); + builder.create(loc, operands); + builder.setInsertionPointAfter(whileOp); + return ret; +} + +SmallVector +SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { + // Clean up the values, it would help use to discover potential bug at a + // earlier stage (instead of silently using a wrong value). + LoopLevelInfo &loopInfo = loopStack.back(); + assert(loopInfo.tids.size() == loopInfo.dims.size()); + SmallVector red; + if (llvm::isa(loopInfo.loop)) { + red = exitCoiterationLoop(builder, loc, reduc); + } else { + red = exitForLoop(builder, loc, reduc); + } + + assert(loopStack.size() == loopSeqStack.size()); loopStack.pop_back(); + return red; } //===----------------------------------------------------------------------===// @@ -500,9 +728,9 @@ auto srcDim = srcShape[i]; // Iterate through dimensions expanded from the i-th dimension. for (unsigned j = start; j < start + map.size(); j++) { - // There can be only one dynamic sized dimension among dimensions expanded - // from the i-th dimension in srcShape. For example, if srcDim = 8, then - // the expanded shape could be <2x?x2>, but not <2x?x?>. + // There can be only one dynamic sized dimension among dimensions + // expanded from the i-th dimension in srcShape. For example, if srcDim + // = 8, then the expanded shape could be <2x?x2>, but not <2x?x?>. 
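[reviewer note] The rule stated in the comment above can be illustrated with a small standalone helper (hypothetical name inferDynamicExtent, not the actual genReshapeDstShape implementation): the single dynamic extent in an expanded group equals the source extent divided by the product of the group's static extents.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

constexpr int64_t kDynamic = -1; // stand-in for ShapedType::kDynamicSize

int64_t inferDynamicExtent(int64_t srcDim, const std::vector<int64_t> &group) {
  int64_t staticProduct = 1;
  int numDynamic = 0;
  for (int64_t d : group) {
    if (d == kDynamic)
      numDynamic++;
    else
      staticProduct *= d;
  }
  assert(numDynamic == 1 && srcDim % staticProduct == 0);
  return srcDim / staticProduct;
}

int main() {
  // srcDim = 8 expanded to <2 x ? x 2>: the '?' must be 8 / (2 * 2) = 2.
  assert(inferDynamicExtent(8, {2, kDynamic, 2}) == 2);
  return 0;
}
```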
if (staticDstShape[j] == ShapedType::kDynamicSize) { // The expanded dimension has dynamic size. We compute the dimension // by dividing srcDim by the product of the static dimensions. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -477,29 +477,35 @@ // 1. Generates loop for the sparse input. SparseTensorLoopEmitter loopEmitter(ValueRange{input}); loopEmitter.initializeLoopEmit(rewriter, loc); - for (int64_t i = 0; i < rank; i++) + for (int64_t i = 0; i < rank; i++) { + // TODO: provide utility function for loop sequences that only contains + // one for loop? + loopEmitter.enterNewLoopSeq(rewriter, loc, 0, static_cast(i)); loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i); + } SmallVector coords; coords.reserve(rank); loopEmitter.getCoordinateArray(coords); - Value vals = loopEmitter.getTensorValueBuffer(0); - Value pidx = loopEmitter.getLastLevelTensorPointerIndex(0); + Value vals = loopEmitter.getValBuffer()[0]; + Value pidx = loopEmitter.getPidxs()[0].back(); // Loads the value from sparse tensor using pointer index; // loads the value from dense tensor using coordinate array. Value val = enc ? rewriter.create(loc, vals, pidx) : rewriter.create(loc, vals, coords); - for (int64_t i = 0; i < rank; i++) - loopEmitter.exitCurrentLoop(); - // 2. Inline the block in the foreach operator. Block::iterator inlinePos = rewriter.getInsertionPoint(); Block *srcBlock = op.getBody(); // Remove sparse_tensor.yield. rewriter.eraseOp(srcBlock->getTerminator()); + for (int64_t i = 0; i < rank; i++) { + loopEmitter.exitCurrentLoop(rewriter, loc); + loopEmitter.exitCurrentLoopSeq(); + } + SmallVector args; // Remap coordinates. for (int64_t i = 0; i < rank; i++) { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -40,6 +40,8 @@ namespace { +constexpr unsigned INVALID_ID = std::numeric_limits::max(); + // Iteration graph sorting. enum SortMask { kSparseOnly = 0x0, @@ -53,34 +55,16 @@ // Code generation. struct CodeGen { - CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops, - OpOperand *op, unsigned nest, std::vector &ts) - : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors), - pointers(numTensors, std::vector(numLoops)), - indices(numTensors, std::vector(numLoops)), - highs(numTensors, std::vector(numLoops)), - pidxs(numTensors, std::vector(numLoops)), - idxs(numTensors, std::vector(numLoops)), sparseOut(op), - outerParNest(nest), topSort(ts) {} + CodeGen(SparsificationOptions o, ValueRange tensors, unsigned numTensors, + unsigned numLoops, OpOperand *op, unsigned nest, + std::vector &ts) + : options(o), loopEmitter(tensors, /*isLastOutput=*/true, + /*isSparseOut=*/op != nullptr), + sparseOut(op), outerParNest(nest), topSort(ts) {} /// Sparsification options. SparsificationOptions options; - /// Universal dense indices and upper bounds (by index). The loops array - /// is updated with the value of the universal dense index in the current - /// loop. The sizes array is set once with the inferred dimension sizes. 
- std::vector loops; - std::vector sizes; - /// Buffers for storing dense and sparse numerical values (by tensor). - /// This array is set once during bufferization of all tensors. - std::vector buffers; - /// Sparse storage schemes (1-D): pointers and indices (by tensor and index). - /// This array is set once during bufferization of all sparse tensors. - std::vector> pointers; - std::vector> indices; - /// Sparse iteration information (by tensor and index). These arrays - /// are updated to remain current within the current loop. - std::vector> highs; - std::vector> pidxs; - std::vector> idxs; + /// Loop emitter helper class. + SparseTensorLoopEmitter loopEmitter; /// Current reduction, updated during code generation. When indices of a /// reduction are exhausted, all inner loops can use a scalarized reduction. unsigned redExp = -1u; @@ -98,8 +82,48 @@ Value expCount; // Topsort (reference should remain in scope). std::vector &topSort; + + // From tensor id + loop id => dim id. + // TODO: This map should probably be maintained by Merger (it can be set up + // together with dimLvlType Map). + std::vector> loopIdxToDim; + + // Initialize the above two mapping. + void buildLoopIdxToDimMap(linalg::GenericOp op); + + Value getLoopIdxValue(size_t loopIdx) const { + for (unsigned lv = 0; lv < topSort.size(); lv++) + if (topSort[lv] == loopIdx) + return loopEmitter.getLoopIV(lv); + + llvm_unreachable("invalid loop index"); + } }; +void CodeGen::buildLoopIdxToDimMap(linalg::GenericOp op) { + size_t numLoops = op.getNumLoops(); + size_t numTensors = op.getNumOperands(); + loopIdxToDim.assign(numTensors, std::vector(numLoops, INVALID_ID)); + + for (OpOperand &t : op->getOpOperands()) { + auto map = op.getMatchingIndexingMap(&t); + auto enc = getSparseTensorEncoding(t.get().getType()); + // Scan all dimensions of current tensor. + unsigned tid = t.getOperandNumber(); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { + auto a = map.getResult(toOrigDim(enc, d)).dyn_cast(); + if (a) { + unsigned loopId = a.getPosition(); + // Fills the mapping. + loopIdxToDim[tid][loopId] = d; + } + // Else a compound affine, do nothing. (at least we are good for + // now, as we only support compound affine expr on non-annoated dense + // tensors). + } + } +} + } // namespace //===----------------------------------------------------------------------===// @@ -128,13 +152,15 @@ /// same index is used more than once. Also rejects compound affine /// expressions in sparse dimensions. 
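[reviewer note] To clarify what the new loopIdxToDim table in CodeGen holds, here is a plain C++ model with hypothetical matmul data (no AffineMap machinery): each tensor's dimension list is inverted into a loop-index-to-dimension map, with INVALID_ID for loops the tensor does not use, which is what buildLoopIdxToDimMap fills from the indexing maps.

```cpp
#include <cassert>
#include <limits>
#include <vector>

constexpr unsigned INVALID_ID = std::numeric_limits<unsigned>::max();

int main() {
  // C(i,j) = A(i,k) * B(k,j) with loop ids i = 0, j = 1, k = 2; each tensor's
  // dimensions listed as the loop id indexing them, in storage order.
  std::vector<std::vector<unsigned>> dimToLoop = {{0, 2}, {2, 1}, {0, 1}};
  const unsigned numLoops = 3;
  std::vector<std::vector<unsigned>> loopIdxToDim(
      dimToLoop.size(), std::vector<unsigned>(numLoops, INVALID_ID));
  for (unsigned tid = 0; tid < dimToLoop.size(); tid++)
    for (unsigned d = 0; d < dimToLoop[tid].size(); d++)
      loopIdxToDim[tid][dimToLoop[tid][d]] = d;
  // Loop k is dimension 1 of A, dimension 0 of B, and unused by C.
  assert(loopIdxToDim[0][2] == 1);
  assert(loopIdxToDim[1][2] == 0);
  assert(loopIdxToDim[2][2] == INVALID_ID);
  return 0;
}
```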
static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a, - DimLevelType dim) { + DimLevelType dim, bool setLvlFormat = true) { switch (a.getKind()) { case AffineExprKind::DimId: { unsigned idx = a.cast().getPosition(); if (!isUndefDLT(merger.getDimLevelType(tensor, idx))) return false; // used more than once - merger.setDimLevelType(tensor, idx, dim); + + if (setLvlFormat) + merger.setDimLevelType(tensor, idx, dim); return true; } case AffineExprKind::Add: @@ -142,8 +168,10 @@ if (!isDenseDLT(dim)) return false; // compound only in dense dim auto binOp = a.cast(); - return findAffine(merger, tensor, binOp.getLHS(), dim) && - findAffine(merger, tensor, binOp.getRHS(), dim); + // We do not set dim level format for affine expresssion like d0 + d1 on + // both loop index at d0 and d1, + return findAffine(merger, tensor, binOp.getLHS(), dim, false) && + findAffine(merger, tensor, binOp.getRHS(), dim, false); } case AffineExprKind::Constant: return isDenseDLT(dim); // const only in dense dim @@ -411,106 +439,45 @@ // Sparse compiler synthesis methods (statements and expressions). //===----------------------------------------------------------------------===// -/// Generates buffer for the output tensor. Note that all sparse kernels -/// assume that when all elements are written to (viz. x(i) = y(i) * z(i)), -/// the output buffer is already initialized to all zeroes and only nonzeroes -/// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)), -/// only nonzeroes values are used for the updates and no assumption on the -/// original contents of the output buffer is necessary. -static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, MemRefType denseTp, - ArrayRef args) { - Location loc = op.getLoc(); - OpOperand *lhs = op.getOutputOperand(0); - Value tensor = lhs->get(); - bool isInit = op.isInitTensor(lhs); - // An output tensor can simply materialize from the buffer of the tensor that - // appears in the outs() clause. For updates, this has the advantage that only - // the nonzero value are involved in the computation, keeping the operation - // O(nnz). In all other cases, we are forced to zero out the buffer to enforce - // the assumption above, which may negatively impact running complexity - // (viz. O(n^2 + nnz) vs. O(nnz) for matrices). - // TODO: use better analysis to avoid zeroing out the buffer? - Value init = builder.create(loc, denseTp, tensor); - if (!isInit) { - Value zero = constantZero(builder, loc, denseTp.getElementType()); - builder.create(loc, ValueRange{zero}, ValueRange{init}); - } - return init; -} - /// Local bufferization of all dense and sparse data structures. static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op) { Location loc = op.getLoc(); - assert(op->getNumOperands() == op.getNumInputs() + 1); - // For every tensor, find lower and upper bound on dimensions, set the - // same bounds on loop indices, and obtain dense or sparse buffer(s). - auto dynShape = {ShapedType::kDynamicSize}; - SmallVector args; - for (OpOperand &t : op->getOpOperands()) { - unsigned tensor = t.getOperandNumber(); - auto shape = op.getShape(&t); - auto map = op.getMatchingIndexingMap(&t); - auto enc = getSparseTensorEncoding(t.get().getType()); - // Scan all dimensions of current tensor. 
- args.clear(); - for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { - AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (a.getKind() != AffineExprKind::DimId) - continue; // compound - unsigned idx = a.cast().getPosition(); - // Handle the different storage schemes. - if (isCompressedDLT(merger.getDimLevelType(tensor, idx))) { - // Compressed dimension, fetch pointer and indices. - auto ptrTp = - MemRefType::get(dynShape, getPointerOverheadType(builder, enc)); - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - codegen.pointers[tensor][idx] = - builder.create(loc, ptrTp, t.get(), dim); - codegen.indices[tensor][idx] = - builder.create(loc, indTp, t.get(), dim); - } else if (isSingletonDLT(merger.getDimLevelType(tensor, idx))) { - // Singleton dimension, fetch indices. - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - codegen.indices[tensor][idx] = - builder.create(loc, indTp, t.get(), dim); - } else { - // Dense dimension, nothing to fetch. - assert(isDenseDLT(merger.getDimLevelType(tensor, idx))); - } - // Find upper bound in current dimension. - unsigned p = toOrigDim(enc, d); - Value up = linalg::createOrFoldDimOp(builder, loc, t.get(), p); - if (ShapedType::isDynamic(shape[p])) - args.push_back(up); - assert(codegen.highs[tensor][idx] == nullptr); - codegen.sizes[idx] = codegen.highs[tensor][idx] = up; - } - // Perform the required bufferization. Dense inputs materialize - // from the input tensors. Dense outputs need special handling. - // Sparse inputs use sparse primitives to obtain the values. - Type elementType = getElementTypeOrSelf(t.get().getType()); - if (!enc) { - // Non-annotated dense tensors. - auto denseTp = MemRefType::get(shape, elementType); - if (tensor < op.getNumInputs()) - codegen.buffers[tensor] = - builder.create(loc, denseTp, t.get()); - else - codegen.buffers[tensor] = - genOutputBuffer(codegen, builder, op, denseTp, args); - } else if (&t != codegen.sparseOut) { - // Annotated sparse tensors (not involved in output). - auto sparseTp = MemRefType::get(dynShape, elementType); - codegen.buffers[tensor] = - builder.create(loc, sparseTp, t.get()); - } - } + assert(op.getNumOperands() == op.getNumInputs() + 1); + + codegen.loopEmitter.initializeLoopEmit( + builder, loc, + /// Generates buffer for the output tensor. Note that all sparse kernels + /// assume that when all elements are written to (viz. x(i) = y(i) * + /// z(i)), the output buffer is already initialized to all zeroes and only + /// nonzeroes values are computed and written out. For updates (viz. x(i) + /// += y(i) * z(i)), only nonzeroes values are used for the updates and no + /// assumption on the original contents of the output buffer is necessary. + [&op](OpBuilder &builder, Location loc, Value memref, + Value tensor) -> Value { + // Must not be a sparse tensor. + assert(!getSparseTensorEncoding(tensor.getType())); + OpOperand *lhs = op.getOutputOperand(0); + // Two output tensors should match. + assert(lhs->get() == tensor); + bool isInit = op.isInitTensor(lhs); + // An output tensor can simply materialize from the buffer of the tensor + // that appears in the outs() clause. For updates, this has the + // advantage that only the nonzero value are involved in the + // computation, keeping the operation O(nnz). 
In all other cases, we are + // forced to zero out the buffer to enforce the assumption above, which + // may negatively impact running complexity (viz. O(n^2 + nnz) vs. + // O(nnz) for matrices). + // TODO: use better analysis to avoid zeroing out the buffer? + Value init = memref; + if (!isInit) { + Value zero = constantZero(builder, loc, + getElementTypeOrSelf(tensor.getType())); + builder.create(loc, ValueRange{zero}, + ValueRange{init}); + } + return init; + }); } /// Generates an affine expression. @@ -522,7 +489,7 @@ switch (a.getKind()) { case AffineExprKind::DimId: { unsigned idx = a.cast().getPosition(); - return codegen.loops[idx]; // universal dense index + return codegen.getLoopIdxValue(idx); // universal dense index } case AffineExprKind::Add: { auto binOp = a.cast(); @@ -552,7 +519,7 @@ AffineExpr a = map.getResult(toOrigDim(enc, map.getNumResults() - 1)); assert(a.getKind() == AffineExprKind::DimId); unsigned idx = a.cast().getPosition(); - return codegen.loops[idx]; + return codegen.getLoopIdxValue(idx); } /// Generates subscript for load/store on a dense or sparse tensor. @@ -566,18 +533,18 @@ if (enc) { // Note that currently, all sparse subscripts are simple. // TODO: accept affine too? - AffineExpr a = map.getResult(toOrigDim(enc, rank - 1)); - assert(a.getKind() == AffineExprKind::DimId); - unsigned idx = a.cast().getPosition(); - assert(codegen.pidxs[tensor][idx] != nullptr); - args.push_back(codegen.pidxs[tensor][idx]); // position index + assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() == + AffineExprKind::DimId); + Value pidx = codegen.loopEmitter.getPidxs()[tensor].back(); + assert(pidx); + args.push_back(pidx); // position index } else { for (unsigned d = 0; d < rank; d++) { AffineExpr a = map.getResult(d); args.push_back(genAffine(codegen, builder, a, op.getLoc())); } } - return codegen.buffers[tensor]; + return codegen.loopEmitter.getValBuffer()[tensor]; } /// Generates insertion code to implement dynamic tensor load. @@ -622,8 +589,8 @@ unsigned rank = op.getRank(t); SmallVector indices; for (unsigned i = 0; i < rank; i++) { - assert(codegen.loops[codegen.topSort[i]]); - indices.push_back(codegen.loops[codegen.topSort[i]]); + assert(codegen.loopEmitter.getLoopIV(i)); + indices.push_back(codegen.loopEmitter.getLoopIV(i)); } builder.create(loc, rhs, t->get(), indices); return; @@ -717,42 +684,16 @@ builder.create(loc, rhs, ptr, args); } -/// Generates a pointer/index load from the sparse storage scheme. Narrower -/// data types need to be zero extended before casting the value into the -/// index type used for looping and indexing. -static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc, - Value ptr, Value s) { - // Simply zero extends narrower indices into 64-bit values before casting to - // index without a performance penalty. - Value load = builder.create(loc, ptr, s); - if (!load.getType().isa()) { - if (load.getType().getIntOrFloatBitWidth() < 64) - load = builder.create(loc, builder.getI64Type(), load); - load = - builder.create(loc, builder.getIndexType(), load); - } - return load; -} - /// Generates an invariant value. -static Value genInvariantValue(Merger &merger, CodeGen &codegen, - OpBuilder &builder, unsigned exp) { - Value val = merger.exp(exp).val; - return val; -} - -/// Generates an address computation "sz * p + i". 
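[reviewer note] The OutputUpdater callback installed in genBuffers above implements the initialization rule described in its comment. A standalone sketch (made-up data, not the MLIR lowering) of the observable effect: for x(i) = y(i) * z(i) every output element must be written, so a non-"init" output buffer is zeroed first, while x(i) += y(i) * z(i) keeps the existing contents and touches only nonzero positions, staying O(nnz).

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> out = {7, 7, 7, 7}; // stale buffer contents
  std::vector<int> idx = {1, 3};          // nonzero coordinates of y * z
  std::vector<double> val = {2, 5};

  bool isInit = false; // plain assignment: zero out before scattering values
  if (!isInit)
    std::fill(out.begin(), out.end(), 0.0);
  for (std::size_t p = 0; p < idx.size(); p++)
    out[idx[p]] += val[p];

  for (double v : out)
    std::printf("%g ", v); // prints: 0 2 0 5
  std::printf("\n");
  return 0;
}
```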
-static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc, - Value size, Value p, Value i) { - Value mul = builder.create(loc, size, p); - return builder.create(loc, mul, i); +inline static Value genInvariantValue(Merger &merger, CodeGen &codegen, + OpBuilder &builder, unsigned exp) { + return merger.exp(exp).val; } /// Generates an index value. -static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx, - unsigned ldx) { - Value ival = codegen.loops[idx]; - return ival; +inline static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, + unsigned idx) { + return codegen.getLoopIdxValue(idx); } /// Semi-ring branches are simply inlined by the sparse compiler. Prior @@ -764,7 +705,7 @@ Block *block, Value e, unsigned ldx) { if (Operation *def = e.getDefiningOp()) { if (auto indexOp = dyn_cast(def)) - return genIndexValue(codegen, rewriter, indexOp.getDim(), ldx); + return genIndexValue(codegen, rewriter, indexOp.getDim()); if (def->getBlock() == block) { for (unsigned i = 0, n = def->getNumOperands(); i < n; i++) def->setOperand( @@ -785,7 +726,7 @@ if (merger.exp(exp).kind == Kind::kInvariant) return genInvariantValue(merger, codegen, rewriter, exp); if (merger.exp(exp).kind == Kind::kIndex) - return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx); + return genIndexValue(codegen, rewriter, merger.exp(exp).index); if (merger.exp(exp).kind == Kind::kReduce) { // Make custom reduction identity accessible for expanded access pattern. @@ -826,7 +767,7 @@ unsigned idx = a.cast().getPosition(); if (idx == ldx) atLevel = true; - return codegen.loops[idx] != nullptr; // no longer in play? + return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play? } case AffineExprKind::Add: case AffineExprKind::Mul: { @@ -901,6 +842,7 @@ if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 || at != codegen.outerParNest) return; // not needed at this level + assert(codegen.redVal == nullptr); // Generate start or end of an expanded access pattern. Value tensor = lhs->get(); Location loc = op.getLoc(); @@ -923,8 +865,8 @@ assert(codegen.expValues); SmallVector indices; for (unsigned i = 0; i < at; i++) { - assert(codegen.loops[codegen.topSort[i]]); - indices.push_back(codegen.loops[codegen.topSort[i]]); + assert(codegen.loopEmitter.getLoopIV(i)); + indices.push_back(codegen.loopEmitter.getLoopIV(i)); } builder.create(loc, codegen.expValues, codegen.expFilled, codegen.expAdded, codegen.expCount, tensor, @@ -934,68 +876,9 @@ } } -/// Generates initialization code for the subsequent loop sequence at -/// current index level. Returns true if the loop sequence needs to -/// maintain the universal index. -static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, unsigned at, BitVector &inits) { - std::vector &topSort(codegen.topSort); - bool needsUniv = false; - Location loc = op.getLoc(); - unsigned idx = topSort[at]; - - // Initialize sparse positions. - for (unsigned b = 0, be = inits.size(); b < be; b++) { - if (!inits[b]) - continue; - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - if (isCompressedDLT(merger.getDimLevelType(b))) { - // Initialize sparse index that will implement the iteration: - // for pidx_idx = pointers(pidx_idx-1), pointers(1+pidx_idx-1) - unsigned pat = at; - for (; pat != 0; pat--) { - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - } - Value ptr = codegen.pointers[tensor][idx]; - Value one = constantIndex(builder, loc, 1); - Value p0 = (pat == 0) ? 
constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0); - Value p1 = builder.create(loc, p0, one); - codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1); - } else if (isSingletonDLT(merger.getDimLevelType(b))) { - // Initialize sparse index that will implement the "iteration": - // for pidx_idx = pidx_idx-1, 1+pidx_idx-1 - // We rely on subsequent loop unrolling to get rid of the loop - // if it is not involved in co-iteration with anything else. - unsigned pat = at; - for (; pat != 0; pat--) { - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - } - Value one = constantIndex(builder, loc, 1); - Value p0 = (pat == 0) ? constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = p0; - codegen.highs[tensor][idx] = builder.create(loc, p0, one); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - // Dense index still in play. - needsUniv = true; - } - } - - // Initialize the universal dense index. - codegen.loops[idx] = constantIndex(builder, loc, 0); - return needsUniv; -} - -/// Returns parallelization strategy. Any implicit loop in the Linalg operation -/// that is marked "parallel" is a candidate. Whether it is actually converted -/// to a parallel operation depends on the requested strategy. +/// Returns parallelization strategy. Any implicit loop in the Linalg +/// operation that is marked "parallel" is a candidate. Whether it is actually +/// converted to a parallel operation depends on the requested strategy. static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction, bool isSparse) { // Reject parallelization of sparse output. @@ -1020,32 +903,16 @@ /// Generates a for-loop on a single index. static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, bool isOuter, bool isInner, - unsigned idx, BitVector &indices) { - unsigned fb = indices.find_first(); - unsigned tensor = merger.tensor(fb); - assert(idx == merger.index(fb)); + unsigned idx, size_t tid, size_t dim, + ArrayRef extraTids, + ArrayRef extraDims) { + Location loc = op.getLoc(); auto iteratorTypes = op.getIteratorTypesArray(); bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]); - bool isSparse = isCompressedDLT(merger.getDimLevelType(fb)) || - isSingletonDLT(merger.getDimLevelType(fb)); + bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) || + isSingletonDLT(merger.getDimLevelType(tid, idx)); bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse); - - // Loop bounds and increment. - Location loc = op.getLoc(); - Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx]; - Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx]; - Value step = constantIndex(builder, loc, 1); - - // Emit a parallel loop. - if (isParallel) { - scf::ParallelOp parOp = builder.create(loc, lo, hi, step); - if (isSparse) - codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0]; - else - codegen.loops[idx] = parOp.getInductionVars()[0]; - builder.setInsertionPointToStart(parOp.getBody()); - return parOp; - } + assert(!isParallel); // Emit a sequential or vector loop. 
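[reviewer note] The refactored genFor/genWhile below thread codegen.redVal and codegen.expCount through the loop emitter via an operand vector that is updated in place. A plain C++ model of that contract (enterLoopModel is a hypothetical stand-in, not the real API): the caller packs the live reduction values, the emitter rewrites each entry to the value valid inside the loop, and the caller rebinds its state from the rewritten entries.

```cpp
#include <cassert>
#include <vector>

void enterLoopModel(std::vector<int> &reduc) {
  for (int &r : reduc)
    r += 100; // pretend this is the loop-region argument bound to the operand
}

int main() {
  int redVal = 1, expCount = 2;
  std::vector<int> operands = {redVal, expCount};
  enterLoopModel(operands);   // in-place update, like MutableArrayRef<Value> reduc
  redVal = operands.front();  // updateReduc(merger, codegen, operands.front())
  expCount = operands.back(); // codegen.expCount = operands.back()
  assert(redVal == 101 && expCount == 102);
  return 0;
}
```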
SmallVector operands; @@ -1054,182 +921,70 @@ if (codegen.expValues) operands.push_back(codegen.expCount); - scf::ForOp forOp = builder.create(loc, lo, hi, step, operands); + Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim( + builder, loc, tid, dim, operands, isParallel, extraTids, extraDims); + // The operands should be updated by loop emitter already. if (codegen.redVal) - updateReduc(merger, codegen, forOp.getRegionIterArgs().front()); + updateReduc(merger, codegen, operands.front()); if (codegen.expValues) - codegen.expCount = forOp.getRegionIterArgs().back(); - // Assign induction variable to sparse or dense index. - Value iv = forOp.getInductionVar(); - if (isSparse) - codegen.pidxs[tensor][idx] = iv; - else - codegen.loops[idx] = iv; - - builder.setInsertionPointToStart(forOp.getBody()); - return forOp; + codegen.expCount = operands.back(); + + return loop; } /// Emit a while-loop for co-iteration over multiple indices. static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned idx, bool needsUniv, - BitVector &indices) { - SmallVector types; + ArrayRef condTids, ArrayRef condDims, + ArrayRef extraTids, + ArrayRef extraDims) { + SmallVector operands; + // Construct the while-loop with a parameter for each index. - Type indexType = builder.getIndexType(); - for (unsigned b = 0, be = indices.size(); b < be; b++) { - if (!indices[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - types.push_back(indexType); - operands.push_back(codegen.pidxs[tensor][idx]); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - if (codegen.redVal) { - types.push_back(codegen.redVal.getType()); + if (codegen.redVal) operands.push_back(codegen.redVal); - } - if (codegen.expValues) { - types.push_back(indexType); + if (codegen.expValues) operands.push_back(codegen.expCount); - } - if (needsUniv) { - types.push_back(indexType); - operands.push_back(codegen.loops[idx]); - } - assert(types.size() == operands.size()); - Location loc = op.getLoc(); - scf::WhileOp whileOp = builder.create(loc, types, operands); - SmallVector locs(types.size(), loc); - Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); - Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); + Operation *loop = codegen.loopEmitter.enterCoiterationOverTensorsAtDims( + builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids, + extraDims); - // Build the "before" region, which effectively consists - // of a conjunction of "i < upper" tests on all induction. - builder.setInsertionPointToStart(&whileOp.getBefore().front()); - Value cond; - unsigned o = 0; - for (unsigned b = 0, be = indices.size(); b < be; b++) { - if (!indices[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value op1 = before->getArgument(o); - Value op2 = codegen.highs[tensor][idx]; - Value opc = builder.create(loc, arith::CmpIPredicate::ult, - op1, op2); - cond = cond ? 
builder.create(loc, cond, opc) : opc; - codegen.pidxs[tensor][idx] = after->getArgument(o++); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } if (codegen.redVal) - updateReduc(merger, codegen, after->getArgument(o++)); + updateReduc(merger, codegen, operands.front()); if (codegen.expValues) - codegen.expCount = after->getArgument(o++); - if (needsUniv) - codegen.loops[idx] = after->getArgument(o++); - assert(o == operands.size()); - builder.create(loc, cond, before->getArguments()); - builder.setInsertionPointToStart(&whileOp.getAfter().front()); - return whileOp; + codegen.expCount = operands.back(); + + return loop; } /// Generates a for-loop or a while-loop, depending on whether it implements /// singleton iteration or co-iteration over the given conjunction. static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned at, bool needsUniv, - BitVector &indices) { + ArrayRef condTids, ArrayRef condDims, + ArrayRef extraTids, + ArrayRef extraDims) { + assert(condTids.size() == condDims.size()); + assert(extraTids.size() == extraDims.size()); unsigned idx = codegen.topSort[at]; - if (indices.count() == 1) { + if (condTids.size() == 1) { bool isOuter = at == 0; bool isInner = at == codegen.topSort.size() - 1; - return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices); - } - return genWhile(merger, codegen, builder, op, idx, needsUniv, indices); -} - -/// Generates the local variables for this loop, consisting of the sparse -/// indices, restored universal dense index, and dense positions. -static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, unsigned at, bool needsUniv, - BitVector &locals) { - std::vector &topSort(codegen.topSort); - Location loc = op.getLoc(); - unsigned idx = topSort[at]; - - // Initialize sparse indices. - Value min; - for (unsigned b = 0, be = locals.size(); b < be; b++) { - if (!locals[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value ptr = codegen.indices[tensor][idx]; - Value s = codegen.pidxs[tensor][idx]; - Value load = genLoad(codegen, builder, loc, ptr, s); - codegen.idxs[tensor][idx] = load; - if (!needsUniv) { - if (min) { - Value cmp = builder.create( - loc, arith::CmpIPredicate::ult, load, min); - min = builder.create(loc, cmp, load, min); - } else { - min = load; - } - } - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - - // Merge dense universal index over minimum. - if (min) { - assert(!needsUniv); - codegen.loops[idx] = min; - } - - // Initialize dense positions. Note that we generate dense indices of the - // output tensor unconditionally, since they may not appear in the lattice, - // but may be needed for linearized codegen. - for (unsigned b = 0, be = locals.size(); b < be; b++) { - if ((locals[b] || merger.isOutTensor(b, idx)) && - isDenseDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - unsigned pat = at; - for (; pat != 0; pat--) - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - Value p = (pat == 0) ? 
constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = genAddress( - codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]); - } + return genFor(merger, codegen, builder, op, isOuter, isInner, idx, + condTids.front(), condDims.front(), extraTids, extraDims); } + return genWhile(merger, codegen, builder, op, idx, needsUniv, condTids, + condDims, extraTids, extraDims); } /// Generates the induction structure for a while-loop. -static void genWhileInduction(Merger &merger, CodeGen &codegen, - OpBuilder &builder, linalg::GenericOp op, - unsigned idx, bool needsUniv, - BitVector &induction, scf::WhileOp whileOp) { +static void finalizeWhileOp(Merger &merger, CodeGen &codegen, + OpBuilder &builder, linalg::GenericOp op, + unsigned idx, bool needsUniv, BitVector &induction, + scf::WhileOp whileOp) { Location loc = op.getLoc(); // Finalize each else branch of all if statements. if (codegen.redVal || codegen.expValues) { @@ -1251,71 +1006,6 @@ } } builder.setInsertionPointToEnd(&whileOp.getAfter().front()); - // Finalize the induction. Note that the induction could be performed - // in the individual if-branches to avoid re-evaluating the conditions. - // However, that would result in a rather elaborate forest of yield - // instructions during code generation. Moreover, performing the induction - // after the if-statements more closely resembles code generated by TACO. - unsigned o = 0; - SmallVector operands; - Value one = constantIndex(builder, loc, 1); - for (unsigned b = 0, be = induction.size(); b < be; b++) { - if (!induction[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value op1 = codegen.idxs[tensor][idx]; - Value op2 = codegen.loops[idx]; - Value op3 = codegen.pidxs[tensor][idx]; - Value cmp = builder.create(loc, arith::CmpIPredicate::eq, - op1, op2); - Value add = builder.create(loc, op3, one); - operands.push_back(builder.create(loc, cmp, add, op3)); - codegen.pidxs[tensor][idx] = whileOp->getResult(o++); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - if (codegen.redVal) { - operands.push_back(codegen.redVal); - updateReduc(merger, codegen, whileOp->getResult(o++)); - } - if (codegen.expValues) { - operands.push_back(codegen.expCount); - codegen.expCount = whileOp->getResult(o++); - } - if (needsUniv) { - operands.push_back( - builder.create(loc, codegen.loops[idx], one)); - codegen.loops[idx] = whileOp->getResult(o++); - } - assert(o == operands.size()); - builder.create(loc, operands); - builder.setInsertionPointAfter(whileOp); -} - -/// Generates the induction structure for a for-loop. -static void genForInduction(Merger &merger, CodeGen &codegen, - OpBuilder &builder, linalg::GenericOp op, - Operation *loop) { - Location loc = op.getLoc(); - unsigned o = 0; - SmallVector operands; - if (codegen.redVal) { - operands.push_back(codegen.redVal); - updateReduc(merger, codegen, loop->getResult(o++)); - } - if (codegen.expValues) { - operands.push_back(codegen.expCount); - codegen.expCount = loop->getResult(o++); - } - assert(o == operands.size()); - if (o > 0) - builder.create(loc, operands); - builder.setInsertionPointAfter(loop); } /// Generates a single if-statement within a while-loop. 
@@ -1333,8 +1023,10 @@
     Value clause;
     if (isCompressedDLT(merger.getDimLevelType(b)) ||
        isSingletonDLT(merger.getDimLevelType(b))) {
-      Value op1 = codegen.idxs[tensor][idx];
-      Value op2 = codegen.loops[idx];
+      auto dim = codegen.loopIdxToDim[tensor][idx];
+      assert(dim != INVALID_ID);
+      Value op1 = codegen.loopEmitter.getCoord()[tensor][dim];
+      Value op2 = codegen.getLoopIdxValue(idx);
       clause = builder.create(loc, arith::CmpIPredicate::eq, op1, op2);
     } else {
@@ -1380,15 +1072,33 @@
 static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, unsigned exp, unsigned at,
                          unsigned idx, unsigned ldx, unsigned lts) {
-  assert(!codegen.loops[idx]);
+  assert(!codegen.getLoopIdxValue(idx));
   // Emit invariants at this loop sequence level.
   genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
   // Emit access pattern expansion for sparse tensor output.
   genExpansion(merger, codegen, builder, op, at, /*atStart=*/true);
   // Emit further intitialization at this loop sequence level.
   unsigned l0 = merger.set(lts)[0];
-  bool needsUniv =
-      genInit(merger, codegen, builder, op, at, merger.lat(l0).bits);
+  bool needsUniv = false;
+
+  SmallVector ts;
+  SmallVector ds;
+  for (auto b : merger.lat(l0).bits.set_bits()) {
+    if (isDenseDLT(merger.getDimLevelType(b)) ||
+        isUndefDLT(merger.getDimLevelType(b))) {
+      needsUniv = true;
+    } else {
+      unsigned tensor = merger.tensor(b);
+      assert(idx == merger.index(b));
+      size_t dim = codegen.loopIdxToDim[tensor][idx];
+      assert(dim != INVALID_ID);
+      ts.push_back(tensor);
+      ds.push_back(dim);
+    }
+  }
+
+  codegen.loopEmitter.enterNewLoopSeq(builder, op.getLoc(), ts, ds);
+
   // Maintain the universal index only if it is actually
   // consumed by a subsequent lattice point.
   if (needsUniv) {
@@ -1406,11 +1116,57 @@
 static Operation *startLoop(Merger &merger, CodeGen &codegen,
                             OpBuilder &builder, linalg::GenericOp op,
                             unsigned at, unsigned li, bool needsUniv) {
+  const BitVector &simple = merger.lat(li).simple;
+  const BitVector &all = merger.lat(li).bits;
+  assert(simple.size() == all.size());
+  // The set of tensors + dims on which the loops are generated.
+  SmallVector condTids, condDims;
+  // The set of (dense) tensors that are optimized away from the loop
+  // condition, yet still need extra locals to iterate on them.
+  SmallVector extraTids, extraDims;
+  // First, convert each bit into a tensor + dim pair.
+  for (unsigned b = 0, e = simple.size(); b < e; b++) {
+    size_t tid = merger.tensor(b);
+    size_t idx = codegen.topSort[at];
+    if (simple.test(b)) {
+      // The simplified condition must be a subset of the original condition.
+      assert(all.test(b));
+      assert(merger.index(b) == idx);
+      if (isUndefDLT(merger.getDimLevelType(b))) {
+        // This could be a synthetic tensor (used for invariants and for the
+        // sparse output tensor).
+        // In both cases, we intend to generate loops over the output tensor.
+        // e.g.,
+        //   out[i][j] = invariant;
+        if (merger.getSynTensorID() == tid)
+          tid = merger.getOutTensorID();
+      }
+      auto dim = codegen.loopIdxToDim[tid][idx];
+      if (dim != INVALID_ID) {
+        // dim could be invalid if this is a zero-ranked tensor.
+        condTids.push_back(tid);
+        condDims.push_back(dim);
+      }
+    } else if ((all.test(b) || merger.isOutTensor(b, idx)) &&
+               isDenseDLT(merger.getDimLevelType(b))) {
+      // Note that we generate dense indices of the output tensor
+      // unconditionally, since they may not appear in the lattice, but may be
+      // needed for linearized codegen.
+      assert(merger.index(b) == idx);
+      // Only dense dimensions should be optimized away from the conditions.
+      assert(isDenseDLT(merger.getDimLevelType(b)));
+      auto dim = codegen.loopIdxToDim[tid][idx];
+      assert(dim != INVALID_ID);
+      extraTids.push_back(tid);
+      extraDims.push_back(dim);
+    }
+  }
   // Emit the for/while-loop control.
   Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv,
-                            merger.lat(li).simple);
+                            condTids, condDims, extraTids, extraDims);
   // Emit the locals for this loop.
-  genLocals(merger, codegen, builder, op, at, needsUniv, merger.lat(li).bits);
+  // genLocals(merger, codegen, builder, op, at, needsUniv,
+  //           merger.lat(li).bits);
   return loop;
 }
@@ -1420,21 +1176,36 @@
                     unsigned li, bool needsUniv) {
   // End a while-loop.
   if (auto whileOp = dyn_cast(loop)) {
-    genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
-                      merger.lat(li).bits, whileOp);
-    return needsUniv;
+    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+                    merger.lat(li).bits, whileOp);
+  } else {
+    needsUniv = false;
   }
-  // End a for-loop.
-  genForInduction(merger, codegen, builder, op, loop);
-  return false;
+
+  SmallVector reduc;
+  if (codegen.redVal)
+    reduc.push_back(codegen.redVal);
+  if (codegen.expValues)
+    reduc.push_back(codegen.expCount);
+
+  auto loopRet =
+      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
+  assert(reduc.size() == loopRet.size());
+
+  if (codegen.redVal)
+    updateReduc(merger, codegen, loopRet.front());
+  if (codegen.expValues)
+    codegen.expCount = loopRet.back();
+
+  return needsUniv;
 }
 
 /// Ends a loop sequence at given level.
 static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                        linalg::GenericOp op, unsigned exp, unsigned at,
                        unsigned idx, unsigned ldx) {
-  assert(codegen.loops[idx]);
-  codegen.loops[idx] = Value();
+  assert(codegen.getLoopIdxValue(idx) == nullptr);
+  codegen.loopEmitter.exitCurrentLoopSeq();
   // Unmark bookkeeping of invariants and loop index.
   genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
   // Finalize access pattern expansion for sparse tensor output.
@@ -1514,7 +1285,7 @@
     } else {
       // To rematerialize an non-annotated tensor, simply load it
       // from the bufferized value.
-      Value val = codegen.buffers.back(); // value array
+      Value val = codegen.loopEmitter.getValBuffer().back(); // value array
      rewriter.replaceOpWithNewOp(op, resType, val);
     }
   }
@@ -1582,10 +1353,17 @@
       // Inadmissible expression, reject.
       return failure();
-    // Recursively generates code if admissible.
     merger.setHasSparseOut(sparseOut != nullptr);
-    CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest,
-                    topSort);
+
+    SmallVector tensors;
+    for (OpOperand &t : op->getOpOperands())
+      tensors.push_back(t.get());
+
+    // Recursively generates code if admissible.
+    CodeGen codegen(options, tensors, numTensors, numLoops, sparseOut,
+                    outerParNest, topSort);
+    // TODO: maybe the merger should be responsible for maintaining the map.
+ codegen.buildLoopIdxToDimMap(op); genBuffers(merger, codegen, rewriter, op); genStmt(merger, codegen, rewriter, op, exp, 0); genResult(merger, codegen, rewriter, op); diff --git a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir --- a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir +++ b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir @@ -74,10 +74,10 @@ // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_14]]] : memref<32xf64> @@ -120,12 +120,12 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64> -// CHECK: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>) -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64> +// CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>) +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_15]], %[[VAL_21:.*]] = %[[VAL_17]]) : (index, index) -> (index, index) { // 
CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_16]] : index // CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_21]], %[[VAL_18]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir @@ -113,10 +113,10 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>) -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>) // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index // CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index @@ -166,9 +166,9 @@ // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) -// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref +// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] { // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref @@ -206,9 +206,9 @@ // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) -// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) 
+// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref @@ -314,9 +314,9 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index // CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index @@ -371,9 +371,9 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_14]]] : memref<32xf32> @@ -408,9 +408,9 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = 
%[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index // CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index @@ -465,9 +465,9 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref @@ -502,11 +502,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index @@ -585,11 +585,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref 
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index @@ -647,11 +647,11 @@ // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -740,11 +740,11 @@ // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) { // CHECK: 
%[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -881,11 +881,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_18:.*]]:3 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_13]]) : (index, index, f32) -> (index, index, f32) { // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -989,12 +989,12 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_21:.*]]:3 = scf.while (%[[VAL_22:.*]] = %[[VAL_17]], %[[VAL_23:.*]] = %[[VAL_19]], %[[VAL_24:.*]] = %[[VAL_15]]) : (index, index, f32) -> (index, index, f32) { // CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_18]] : index // CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_20]] : index @@ -1106,13 +1106,13 @@ // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]] {dimension = 0 : index} : tensor> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]] {dimension = 0 : index} : tensor> to memref // 
CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor> to memref -// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_4]] -// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref) -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref -// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref) +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_23:.*]]:3 = scf.while (%[[VAL_24:.*]] = %[[VAL_19]], %[[VAL_25:.*]] = %[[VAL_21]], %[[VAL_26:.*]] = %[[VAL_5]]) : (index, index, index) -> (index, index, index) { // CHECK: %[[VAL_27:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_20]] : index // CHECK: %[[VAL_28:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_22]] : index @@ -1284,13 +1284,13 @@ // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref // CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_24:.*]]:4 = scf.while (%[[VAL_25:.*]] = %[[VAL_18]], %[[VAL_26:.*]] = %[[VAL_20]], %[[VAL_27:.*]] = %[[VAL_22]], %[[VAL_28:.*]] = %[[VAL_17]]) : (index, index, index, f64) -> (index, index, index, f64) { // CHECK: %[[VAL_29:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_19]] : index // CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_26]], %[[VAL_21]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir @@ -962,7 +962,7 @@ // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] 
{dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor +// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor // CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { @@ -1015,7 +1015,7 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref // CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_3]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -1095,7 +1095,7 @@ // CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref // CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref // CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref -// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor // CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref // CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir @@ -1126,11 +1126,11 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 2 : index} : tensor> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 2 : index} : tensor> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_1]], %[[VAL_6]] : tensor // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor -// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] { @@ -1247,7 +1247,7 @@ // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor // CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> +// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir @@ -20,8 +20,8 @@ // CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref @@ -38,8 +38,8 @@ // CHECK: %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref @@ -57,8 +57,8 @@ // CHECK: %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_index.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_index.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_index.mlir @@ -25,14 +25,15 @@ // CHECK-DAG: %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor // CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { -// CHECK-HIR: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> -// CHECK-HIR: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref -// CHECK-HIR: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index -// CHECK-HIR: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref +// CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref +// CHECK-HIR-DAG: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index +// CHECK-HIR-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref // CHECK-HIR: scf.for %[[VAL_17:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_5]] { -// CHECK-HIR: 
%[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref -// CHECK-HIR: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> -// CHECK-HIR: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref +// CHECK-HIR-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref +// CHECK-HIR-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref // CHECK-HIR: %[[VAL_21:.*]] = arith.mulf %[[VAL_20]], %[[VAL_13]] : f64 // CHECK-HIR: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] : f64 // CHECK-HIR: memref.store %[[VAL_22]], %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir @@ -60,17 +60,17 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> -// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor -// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor -// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor +// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor // CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref) -// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { -// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { -// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_8]], %[[VAL_11]] : index +// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] { +// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { +// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_7]], %[[VAL_11]] : index // CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index -// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] { -// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_6]], %[[VAL_14]] : index +// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { +// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_8]], %[[VAL_14]] : index // CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_15]] : index // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_17]]] : memref // CHECK: memref.store %[[VAL_18]], %[[VAL_10]]{{\[}}%[[VAL_15]], %[[VAL_11]], %[[VAL_12]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir @@ -117,8 +117,8 @@ // CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref // CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] { // CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref -// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref +// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref // CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] { // CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref @@ -266,8 +266,8 @@ // CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : 
memref // CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] { // CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref -// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref +// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref // CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] { // CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir @@ -27,17 +27,17 @@ // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.200000e+00 : f32 // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32 -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32> -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32 +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_7]] { // CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_19]]] : memref // CHECK: %[[VAL_21:.*]] = memref.load 
%[[VAL_11]]{{\[}}%[[VAL_19]]] : memref