diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -153,11 +153,15 @@ /// last tensor in this set denoting the output tensor. The merger adds an /// additional synthetic tensor at the end of this set to represent all /// invariant expressions in the kernel. - Merger(unsigned t, unsigned l) - : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l), - hasSparseOut(false), - dimTypes(t + 1, std::vector(l, DimLevelType::Undef)), - loopIdxToDim(t + 1, std::vector>(l, llvm::None)) {} + Merger(unsigned t, unsigned l, unsigned fl) + : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), + numNativeLoops(l), numLoops(l + fl), hasSparseOut(false), + dimTypes(numTensors, + std::vector(numLoops, DimLevelType::Undef)), + loopIdxToDim(numTensors, + std::vector>(numLoops, llvm::None)), + dimToLoopIdx(numTensors, + std::vector>(numLoops, llvm::None)) {} /// Adds a tensor expression. Returns its index. unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value(), @@ -227,6 +231,15 @@ /// Bit translation (get loop index). unsigned index(unsigned b) const { return b / numTensors; } + /// Get the number of total loops (native loops + filter loops). + unsigned getNumLoops() const { return numLoops; } + /// Get the number of native loops. + unsigned getNumNativeLoops() const { return numNativeLoops; } + /// Get the number of filter loops. + unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; } + /// Get the starting filter loop index. + unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); } + /// Returns true if bit corresponds to index of output tensor. bool isOutTensor(unsigned b, unsigned i) const { return tensor(b) == outTensor && index(b) == i; @@ -238,6 +251,11 @@ /// expressions). unsigned getSynTensorID() const { return syntheticTensor; } + bool isFilterLoop(unsigned ldx) const { + assert(ldx < numLoops); + return ldx >= numNativeLoops; + } + /// Returns true if given tensor iterates *only* in the given tensor /// expression. For the output tensor, this defines a "simply dynamic" /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for @@ -258,6 +276,11 @@ return getDimLevelType(tensor(b), index(b)); } + Optional getLoopIdx(unsigned t, unsigned dim) const { + assert(t < numTensors && dim < numLoops); + return dimToLoopIdx[t][dim]; + } + /// Gets the dimension number of the the `t`th tensor on `i`th loop. Optional getDimNum(unsigned t, unsigned i) const { assert(t < numTensors && i < numLoops); @@ -276,6 +299,8 @@ assert(isValidDLT(dlt)); dimTypes[t][i] = dlt; loopIdxToDim[t][i] = dim; + assert(dim < numLoops); + dimToLoopIdx[t][dim] = i; } // Iterates the bits of a lattice, for each set bit, converts it into the @@ -334,6 +359,7 @@ const unsigned outTensor; const unsigned syntheticTensor; const unsigned numTensors; + const unsigned numNativeLoops; const unsigned numLoops; bool hasSparseOut; // Map that converts pair to the corresponding dimension @@ -341,6 +367,8 @@ std::vector> dimTypes; // Map that converts pair to the corresponding dimension. std::vector>> loopIdxToDim; + // Map that converts pair to the corresponding loop id. 
+ std::vector>> dimToLoopIdx; llvm::SmallVector tensorExps; llvm::SmallVector latPoints; llvm::SmallVector, 8> latSets; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -318,7 +318,6 @@ // loopEmiter.exitCurrentLoop(); // exit i //===----------------------------------------------------------------------===// -// TODO: Sparsification should also rely on this class to generate loops. class SparseTensorLoopEmitter { public: /// Optional callback function to setup dense output tensors when @@ -332,13 +331,17 @@ /// If isSparseOut is set, loop emitter assume that the sparse output tensor /// is empty, and will always generate loops on it based on the dim sizes. explicit SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput = false, - bool isSparseOut = false); + bool isSparseOut = false, + ArrayRef topSort = {}); /// Starts a loop emitting session by generating all the buffers needed to /// iterate tensors. void initializeLoopEmit(OpBuilder &builder, Location loc, OutputUpdater updater = nullptr); + /// Generates a list of operations to compute the affine expression. + Value genAffine(OpBuilder &builder, AffineExpr a, Location loc); + /// Enters a new loop sequence, the loops within the same sequence starts from /// the break points of previous loop instead of starting over from 0. /// e.g., @@ -377,6 +380,15 @@ ArrayRef extraTids = {}, ArrayRef extraDims = {}); + Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine, + MutableArrayRef reduc = {}); + + void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine); + /// Emits a co-iteration loop over a set of tensors. Operation *enterCoIterationOverTensorsAtDims( OpBuilder &builder, Location loc, ArrayRef tids, @@ -392,6 +404,9 @@ coords.push_back(l.iv); } + /// Gets loop induction variable at the given level. + unsigned getCurrentDepth() const { return loopStack.size(); } + /// Gets loop induction variable at the given level. Value getLoopIV(size_t level) const { if (level < loopStack.size()) @@ -509,6 +524,11 @@ // sequence. std::vector loopSeqStack; + // Maps AffineDimExpr to the index of the loop in loopStack. + // TODO: We should probably use a callback function here to make it more + // general. + std::vector sparsiferLoopLvlMap; + // TODO: not yet used, it should track the current level for each tensor // to help eliminate `dim` paramters from above APIs. 
// std::vector curLv; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp @@ -96,12 +96,14 @@ SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput, - bool isSparseOut) + bool isSparseOut, + ArrayRef topSort) : hasOutput(hasOutput), isSparseOut(isSparseOut), tensors(tensors.begin(), tensors.end()), dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()), ptrBuffer(tensors.size()), idxBuffer(tensors.size()), - valBuffer(tensors.size()), loopStack() { + valBuffer(tensors.size()), loopStack(), + sparsiferLoopLvlMap(topSort.size(), 0) { for (size_t tid = 0, e = tensors.size(); tid < e; tid++) { auto t = tensors[tid]; // a scalar or 0-dimension tensors @@ -125,6 +127,13 @@ ptrBuffer[tid].assign(rank, Value()); idxBuffer[tid].assign(rank, Value()); } + + for (unsigned i = 0, e = topSort.size(); i < e; i++) { + // This is an inverse map of the topologically sorted loop index from + // sparsifier. This is needed to map the AffineDimExpr back to the loopStack + // index used in loop emitter. + sparsiferLoopLvlMap[topSort[i]] = i; + } } void SparseTensorLoopEmitter::initializeLoopEmit( @@ -291,6 +300,108 @@ return loop; } +Value SparseTensorLoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, + Location loc) { + switch (a.getKind()) { + case AffineExprKind::DimId: { + unsigned idx = a.cast().getPosition(); + return loopStack[sparsiferLoopLvlMap[idx]].iv; + } + case AffineExprKind::Add: { + auto binOp = a.cast(); + return builder.create( + loc, genAffine(builder, binOp.getLHS(), loc), + genAffine(builder, binOp.getRHS(), loc)); + } + case AffineExprKind::Mul: { + auto binOp = a.cast(); + return builder.create( + loc, genAffine(builder, binOp.getLHS(), loc), + genAffine(builder, binOp.getRHS(), loc)); + } + case AffineExprKind::Constant: { + int64_t c = a.cast().getValue(); + return constantIndex(builder, loc, c); + } + default: + llvm_unreachable("unexpected affine subscript"); + } +} + +Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim( + OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine, + MutableArrayRef reduc) { + assert(!affine.isa() && !isDenseDLT(dimTypes[tid][dim])); + assert(dimTypes[tid].size() > dim); + // We can not re-enter the same level. + assert(!coord[tid][dim]); + + Value step = constantIndex(builder, loc, 1); + + Value lo = pidxs[tid][dim]; + Value hi = highs[tid][dim]; + + // TODO: We should instead use a whileOp for filter loop to allow early + // break when exceeding (for ordered dimensions). + // TODO: There are many other potiential opportunities that we might apply in + // the future. E.g., we could use binary search to located the pointer index. + scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); + + // In-place update on the reduction variable vector. + assert(forOp.getNumRegionIterArgs() == reduc.size()); + for (int i = 0, e = reduc.size(); i < e; i++) + reduc[i] = forOp.getRegionIterArg(i); + + builder.setInsertionPointToStart(forOp.getBody()); + Value iv = forOp.getInductionVar(); + + pidxs[tid][dim] = iv; + // Generating a load on the indices array yields the coordinate. 
+ Value ptr = idxBuffer[tid][dim]; + coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); + + // Generate a if condition to filter out indices that is not equal to the + // result of the affine expression. + Value expected = genAffine(builder, affine, loc); + auto pred = builder.create(loc, arith::CmpIPredicate::eq, + coord[tid][dim], expected); + SmallVector types; + for (Value red : reduc) { + types.push_back(red.getType()); + } + + bool hasReduc = !types.empty(); + scf::IfOp ifOp = + builder.create(loc, types, pred, /*else*/ hasReduc); + if (hasReduc) { + // scf.for (a) -> v + // %s = scf.if (a) -> v + // user-generated code. + // else + // yield a + // yield %s + builder.create(loc, ifOp.getResults()); + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + // On mismatch. + builder.create(loc, reduc); + } + // Set the insert point to matched branch. + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + // NOTE: we can also prepares for next dim here in advance + // Push the loop into stack + loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp, + coord[tid][dim]); + return forOp; +} + +void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel( + OpBuilder &builder, Location loc, size_t tid, size_t dim, + AffineExpr affine) { + Value affineV = genAffine(builder, affine, loc); + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV); +} + Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef dims, bool needsUniv, MutableArrayRef reduc, @@ -469,7 +580,6 @@ if (forOp) { if (!reduc.empty()) { assert(reduc.size() == forOp.getNumResults()); - rewriter.setInsertionPointToEnd(forOp.getBody()); rewriter.create(loc, reduc); } // Exit the loop. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -27,6 +27,7 @@ #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" #include "mlir/Dialect/SparseTensor/Utils/Merger.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TensorEncoding.h" #include "llvm/ADT/SmallBitVector.h" @@ -57,7 +58,7 @@ unsigned numLoops, OpOperand *op, unsigned nest, std::vector &ts) : options(o), loopEmitter(tensors, /*hasOutput=*/true, - /*isSparseOut=*/op != nullptr), + /*isSparseOut=*/op != nullptr, ts), sparseOut(op), outerParNest(nest), topSort(ts) { if (op) insChain = op->get(); @@ -85,6 +86,11 @@ // Topsort (reference should remain in scope). 
std::vector &topSort; + ArrayRef getLoopCurStack() const { + ArrayRef topSortRef = topSort; + return topSortRef.slice(0, loopEmitter.getCurrentDepth()); + } + Value getLoopIdxValue(size_t loopIdx) const { for (unsigned lv = 0; lv < topSort.size(); lv++) if (topSort[lv] == loopIdx) @@ -94,27 +100,104 @@ } }; +class ParallelAffineDimFinder + : public AffineExprVisitor { + AffineExpr paraDim; + utils::IteratorType pickIterType; + SmallVector iterTypes; + +public: + explicit ParallelAffineDimFinder(linalg::GenericOp op) + : iterTypes(op.getIteratorTypesArray()) {} + void visitDimExpr(AffineDimExpr expr) { + if (paraDim == nullptr || pickIterType == iterTypes[expr.getPosition()]) { + paraDim = expr; + } + } + + void setPickedIterType(utils::IteratorType iterType) { + pickIterType = iterType; + } + + AffineDimExpr getDimExpr() const { return paraDim.cast(); } +}; } // namespace //===----------------------------------------------------------------------===// // Sparse compiler analysis methods. //===----------------------------------------------------------------------===// +/// Determines if affine expression is invariant. +static bool isInvariantAffine(AffineExpr a, ArrayRef loopStack, + unsigned ldx, bool &atLevel) { + switch (a.getKind()) { + case AffineExprKind::DimId: { + unsigned idx = a.cast().getPosition(); + if (idx == ldx) { + atLevel = true; + // Must be invariant if we are at the level. + return true; + } + bool isInvariant = false; + for (unsigned loop : loopStack) { + isInvariant = (loop == idx); + if (isInvariant) + break; + } + return isInvariant; + } + case AffineExprKind::Add: + case AffineExprKind::Mul: { + auto binOp = a.cast(); + return isInvariantAffine(binOp.getLHS(), loopStack, ldx, atLevel) && + isInvariantAffine(binOp.getRHS(), loopStack, ldx, atLevel); + } + default: { + assert(a.isa()); + return true; + } + } +} + +/// Determines if affine expression is invariant. +static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a, + unsigned ldx, bool &atLevel) { + return isInvariantAffine(a, codegen.getLoopCurStack(), ldx, atLevel); +} + /// Helper method to construct a permuted dimension ordering /// that adheres to the given topological sort. -static AffineMap permute(MLIRContext *context, AffineMap m, - std::vector &topSort) { +static AffineMap permute(const Merger &merger, MLIRContext *context, + AffineMap m, ArrayRef topSort) { unsigned sz = topSort.size(); - assert(m.getNumResults() == sz && "TopoSort/AffineMap size mismatch"); + assert(m.getNumDims() + merger.getNumFilterLoops() == sz && + "TopoSort/AffineMap size mismatch"); // Construct the inverse of `m`; to avoid the asymptotic complexity // of calling `m.getPermutedPosition` repeatedly. - SmallVector inv(sz); - for (unsigned i = 0; i < sz; i++) - inv[i] = m.getDimPosition(i); + SmallVector perm; + unsigned numResults = m.getNumResults(); + BitVector worklist(numResults, true); + unsigned loopDepth = 1; // Construct the permutation. - SmallVector perm(sz); - for (unsigned i = 0; i < sz; i++) - perm[i] = inv[topSort[i]]; + while (worklist.any()) { + unsigned preSize = perm.size(); + for (auto dim : worklist.set_bits()) { + bool atLevel = false; + if (isInvariantAffine(m.getResult(dim), topSort.slice(0, loopDepth), + topSort[loopDepth - 1], atLevel) && + atLevel) { + perm.push_back(dim); + } + } + // Removes resolved dimension. + for (unsigned i = preSize, e = perm.size(); i < e; i++) + worklist.reset(perm[i]); + + // Tries to entering the next loop level. 
+ loopDepth += 1; + } + + assert(perm.size() == numResults); return AffineMap::getPermutationMap(perm, context); } @@ -122,7 +205,7 @@ /// same index is used more than once. Also rejects compound affine /// expressions in sparse dimensions. static bool findAffine(Merger &merger, unsigned tensor, unsigned dim, - AffineExpr a, DimLevelType dlt, + AffineExpr a, DimLevelType dlt, unsigned &filterLdx, bool setLvlFormat = true) { switch (a.getKind()) { case AffineExprKind::DimId: { @@ -135,22 +218,58 @@ return true; } case AffineExprKind::Add: - case AffineExprKind::Mul: { - if (!isDenseDLT(dlt)) - return false; // compound only in dense dim - auto binOp = a.cast(); - // We do not set dim level format for affine expresssion like d0 + d1 on - // both loop index at d0 and d1, - return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) && - findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false); + case AffineExprKind::Mul: + case AffineExprKind::Constant: { + if (!isDenseDLT(dlt) && setLvlFormat) { + assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx))); + // Use a filter loop for sparse affine expression. + merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt); + } + + if (auto binOp = a.dyn_cast()) { + // We do not set dim level format for affine expresssion like d0 + d1 on + // either loop index at d0 or d1/ + // We continues the recursion merely to check whether current affine is + // admissible or not. + return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx, + false) && + findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx, + false); + } + // Falls through when it is a constant Affine + return true; } - case AffineExprKind::Constant: - return isDenseDLT(dlt); // const only in dense dim default: return false; } } +static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap, + Value tensor) { + unsigned num = 0; + auto enc = getSparseTensorEncoding(tensor.getType()); + if (enc) { + ArrayRef exps = affineMap.getResults(); + for (unsigned rank = 0; rank < exps.size(); rank++) { + auto aidx = toOrigDim(enc, rank); + auto affine = exps[aidx]; + if (!affine.isa()) + if (!isDenseDLT(getDimLevelType(enc, rank))) + num++; + } + } + + return num; +} + +static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) { + unsigned num = 0; + for (OpOperand &t : op->getOpOperands()) + num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t), + t.get()); + return num; +} + /// Helper method to inspect sparse encodings in the tensor types. /// Fills the per-dimension sparsity information for all tensors. /// Returns true if the sparse annotations and affine subscript @@ -158,19 +277,22 @@ /// no annotations are found or inadmissible constructs occur. 
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) { bool annotated = false; + unsigned filterLdx = merger.getFilterLoopStartingIdx(); for (OpOperand &t : op->getOpOperands()) { auto map = op.getMatchingIndexingMap(&t); auto enc = getSparseTensorEncoding(t.get().getType()); if (enc) annotated = true; assert(map.getNumResults() == op.getRank(&t)); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { unsigned tensor = t.getOperandNumber(); AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d))) + if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx)) return false; // inadmissible affine expression } } + assert(filterLdx == merger.getNumLoops()); return annotated; } @@ -180,34 +302,58 @@ /// latest possible index. static bool topSortOptimal(unsigned n, ArrayRef iteratorTypes, - std::vector &topSort, + const Merger &merger, std::vector &topSort, std::vector &inDegree, std::vector> &adjM) { - std::vector redIt; // reduce iterator with 0 degree - std::vector parIt; // parallel iterator with 0 degree + std::vector redIt; // reduce iterator with 0 degree + std::vector parIt; // parallel iterator with 0 degree + std::vector filterIt; // filter loop with 0 degree for (unsigned i = 0; i < n; i++) { if (inDegree[i] == 0) { - if (linalg::isReductionIterator(iteratorTypes[i])) + if (merger.isFilterLoop(i)) + filterIt.push_back(i); + else if (linalg::isReductionIterator(iteratorTypes[i])) redIt.push_back(i); else parIt.push_back(i); } } - while (!redIt.empty() || !parIt.empty()) { - // We always choose parallel iterator if there is any. - auto &it = !parIt.empty() ? parIt : redIt; + while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) { + // We always choose in order of filter loop -> parallel loop -> reduction + // loop because + // 1. Putting reduction loop early might make the loop sequence + // inadmissible. + // 2. Filter Loop should be put as early as possible for better performance, + // since only one (if any) iteration will carry the computation. E.g., + // for (1 to N) + // for (1 to M) + // for (1 to K) + // if (xxx) + // O(X) computation => O(NMK+NMX) time complexity + // + // By putting the filter loop one level up, we got + // + // for (1 to N) + // for (1 to K) + // if (xxx) + // for (1 to M) + // O(X) computation => O(NK+NMX) time complexity + auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt); auto src = it.back(); topSort.push_back(src); it.pop_back(); // Update in-degree, and push 0-degree node into worklist. - for (unsigned dst = 0; dst < n; dst++) + for (unsigned dst = 0; dst < n; dst++) { if (adjM[src][dst] && --inDegree[dst] == 0) { - if (linalg::isReductionIterator(iteratorTypes[dst])) + if (merger.isFilterLoop(dst)) + filterIt.push_back(dst); + else if (linalg::isReductionIterator(iteratorTypes[dst])) redIt.push_back(dst); else parIt.push_back(dst); } + } } return topSort.size() == n; } @@ -217,23 +363,38 @@ /// example i0+i1 < i2+i3+1 yields i0> &adjM, std::vector &inDegree, AffineExpr a, - AffineExpr b, unsigned fidx) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - if (b) - addAffineOrderings(adjM, inDegree, b, AffineExpr(), idx); - else if (!adjM[fidx][idx]) { - adjM[fidx][idx] = true; - inDegree[idx]++; + AffineExpr b, Optional fidx, + Optional tidx) { + if (!a && !b) { + // Recursion leaf. 
+ assert(fidx && tidx); + unsigned f = *fidx, t = *tidx; + if (!adjM[f][t]) { + adjM[f][t] = true; + inDegree[t]++; } + return; + } + auto toExpand = a ? a : b; + switch (toExpand.getKind()) { + case AffineExprKind::DimId: { + auto idx = toExpand.cast().getPosition(); + if (toExpand == a) + addAffineOrderings(adjM, inDegree, AffineExpr(), b, idx, tidx); + else // toExpand == b + addAffineOrderings(adjM, inDegree, a, AffineExpr(), fidx, idx); break; } case AffineExprKind::Add: case AffineExprKind::Mul: { - auto binOp = a.cast(); - addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx); - addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx); + auto binOp = toExpand.cast(); + if (toExpand == a) { + addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx, tidx); + addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx, tidx); + } else { + addAffineOrderings(adjM, inDegree, a, binOp.getLHS(), fidx, tidx); + addAffineOrderings(adjM, inDegree, a, binOp.getRHS(), fidx, tidx); + } break; } default: @@ -251,19 +412,16 @@ OpOperand *skip = nullptr) { // Set up an n x n from/to adjacency matrix of the iteration graph // for the implicit loop indices i_0 .. i_n-1. - unsigned n = op.getNumLoops(); + unsigned n = merger.getNumLoops(); std::vector> adjM(n, std::vector(n, false)); std::vector inDegree(n, 0); // in-degree of each node. auto iteratorTypes = op.getIteratorTypesArray(); // Iterate over the indexing maps of every tensor in the tensor expression. for (OpOperand &t : op->getOpOperands()) { - // Skip tensor during cycle resolution. - if (&t == skip) - continue; // Get map and encoding. auto map = op.getMatchingIndexingMap(&t); auto enc = getSparseTensorEncoding(t.get().getType()); - assert(map.getNumDims() == n); + assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n); // Skip dense tensor constraints when not requested. if (!(mask & SortMask::kIncludeDense) && !enc) continue; @@ -271,10 +429,84 @@ // by default) puts an ordering constraint on the loop indices. For // example, the tensor expresion A_ijk forces the ordering i < j < k // on the loop indices if no explicit dimension ordering is given. - for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) { - AffineExpr f = map.getResult(toOrigDim(enc, d - 1)); - AffineExpr t = map.getResult(toOrigDim(enc, d)); - addAffineOrderings(adjM, inDegree, f, t, 0); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { + AffineExpr ta = map.getResult(toOrigDim(enc, d)); + Optional tldx = merger.getLoopIdx(t.getOperandNumber(), d); + + // Filter loops should be constructed after all the dependent loops, + // i.e., d0 + d1 < filter_loop(d0 + d1) + if (tldx && merger.isFilterLoop(tldx.value())) { + assert(!ta.isa() && + !isDenseDLT(getDimLevelType(enc, d))); + addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx); + // Now that the ordering of affine expression is captured by filter + // loop idx, we only need to ensure the affine ordering against filter + // loop. Thus, we reset the affine express to nil here to mark it as + // resolved. + ta = AffineExpr(); + } + + // Skip tensor during cycle resolution, though order between filter loop + // and dependent loops need to be guaranteed unconditionally. + if (&t == skip) + continue; + + if (d > 0) { + AffineExpr fa = map.getResult(toOrigDim(enc, d - 1)); + Optional fldx = + merger.getLoopIdx(t.getOperandNumber(), d - 1); + + if (fldx && merger.isFilterLoop(fldx.value())) { + // This must be a compound affine expression on sparse dimension. 
+ assert(!fa.isa() && + !isDenseDLT(getDimLevelType(enc, d - 1))); + // For the same reason above. + fa = AffineExpr(); + } + + if (!(mask & SortMask::kIncludeDense) && !tldx) { + ParallelAffineDimFinder finder(op); + // e.g, for [dense, dense] -> (d0 + d1, d2 + d3) + // It is totally fine to have loop sequence d0->d2->d1->d3 instead of + // requiring d0 < d2, d1 < d2, d0 < d3, d1 < d3. + // We use a heuristic here to only pick one dim expression from each + // compound affine expression to establish the order between two dense + // dimensions. + // NOTE: The ordering can only be loosen when the destination level is + // dense, for [dense, sparse] -> (d0 + d1, d2), we still require both + // d0 < d2 and d1 < d2 to ensure correct ordering (i.e., no ordering + // like d0->d2->d1). + // TODO: this is obviously a sub optimal solution. + if (!fldx && fa.isa()) { + assert(isDenseDLT(getDimLevelType(enc, d - 1)) && + !fa.isa()); + // Heuristic: we prefer parallel loop for lhs to reduce the chance + // we add reduce < parallel ordering. + finder.setPickedIterType(utils::IteratorType::parallel); + finder.walkPostOrder(fa); + fa = finder.getDimExpr(); + fldx = finder.getDimExpr().getPosition(); + } + if (!ta.isa()) { + // Dense compound affine + assert(isDenseDLT(getDimLevelType(enc, d)) && + !ta.isa()); + // Heuristic: we prefer reduction loop for rhs to reduce the chance + // addint reduce < parallel ordering. + finder.setPickedIterType(utils::IteratorType::reduction); + finder.walkPostOrder(ta); + ta = finder.getDimExpr(); + tldx = finder.getDimExpr().getPosition(); + } + } + + // (d0 + d1) < (d2 + d3), or + // filter_loop_d-1 < (d2 + d3), or + // (d0 + d1) < filter_loop_d, or + // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset + // above. + addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx); + } } // Push unrelated loops into sparse iteration space, so these // will be skipped more often. @@ -298,7 +530,7 @@ // Report failure for a cyclic iteration graph. topSort.clear(); topSort.reserve(n); - return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM); + return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM); } /// Returns true if tensor materializes uninitialized into the computation. @@ -326,9 +558,8 @@ // An all-dense annotated "sparse" output tensor becomes a linearized random // access 1-dim memref. Also admissible since insertions cannot occur. bool allDense = true; - auto iteratorTypes = op.getIteratorTypesArray(); - unsigned numLoops = iteratorTypes.size(); - for (unsigned i = 0; i < numLoops; i++) + unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops + for (unsigned i = 0; i < merger.getNumLoops(); i++) if (isCompressedDLT(merger.getDimLevelType(tensor, i)) || isSingletonDLT(merger.getDimLevelType(tensor, i))) { allDense = false; @@ -339,19 +570,31 @@ } if (allDense) return true; + + // TODO: support compound affine expression on sparse output. + if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs), + lhs->get()) != 0) + return false; + // A tensor expression with a sparse output tensor that changes its values // but not its nonzero structure, an operation called "simply dynamic" in // [Bik96,Ch9], is also admissible without special codegen. if (merger.isSingleCondition(tensor, exp)) return true; + // Accept "truly dynamic" if the output tensor materializes uninitialized // into the computation and insertions occur in lexicographic index order. 
if (isMaterializing(lhs->get())) { + auto iteratorTypes = op.getIteratorTypesArray(); unsigned nest = 0; for (unsigned i = 0; i < numLoops; i++) { - if (linalg::isReductionIterator(iteratorTypes[topSort[i]])) - break; // terminate at first reduction - nest++; + if (!merger.isFilterLoop(topSort[i])) { + // We only count non-filter loops as filter loops should be considered + // as a special type of parallel loops. + if (linalg::isReductionIterator(iteratorTypes[topSort[i]])) + break; // terminate at first reduction + nest++; + } } // Determine admissible dynamic insertion situations: // (1) fully injective, since there are no reductions, @@ -448,12 +691,12 @@ codegen.loopEmitter.initializeLoopEmit( builder, loc, /// Generates buffer for the output tensor. - /// Note that all sparse kernels assume that when all elements are written - /// to (viz. x(i) = y(i) * z(i)), the output buffer is already initialized - /// to all zeroes and only nonzeroes values are computed and written out. - /// For updates (viz. x(i) += y(i) * z(i)), only nonzeroes values are used - /// for the updates and no assumption on the original contents of the - /// output buffer is necessary. + /// Note that all sparse kernels assume that when all elements are + /// written to (viz. x(i) = y(i) * z(i)), the output buffer is already + /// initialized to all zeroes and only nonzeroes values are computed and + /// written out. For updates (viz. x(i) += y(i) * z(i)), only nonzeroes + /// values are used for the updates and no assumption on the original + /// contents of the output buffer is necessary. [&op](OpBuilder &builder, Location loc, Value memref, Value tensor) -> Value { // Must not be a sparse tensor. @@ -462,13 +705,13 @@ // Two output tensors references should pointed to the same object. assert(lhs->get() == tensor); bool isInit = op.isInitTensor(lhs); - // An output tensor can simply materialize from the buffer of the tensor - // that appears in the outs() clause. For updates, this has the + // An output tensor can simply materialize from the buffer of the + // tensor that appears in the outs() clause. For updates, this has the // advantage that only the nonzero value are involved in the - // computation, keeping the operation O(nnz). In all other cases, we are - // forced to zero out the buffer to enforce the assumption above, which - // may negatively impact running complexity (viz. O(n^2 + nnz) vs. - // O(nnz) for matrices). + // computation, keeping the operation O(nnz). In all other cases, we + // are forced to zero out the buffer to enforce the assumption above, + // which may negatively impact running complexity (viz. O(n^2 + nnz) + // vs. O(nnz) for matrices). // TODO: use better analysis to avoid zeroing out the buffer? Value init = memref; if (!isInit) { @@ -481,38 +724,6 @@ }); } -/// Generates an affine expression. 
-// -// TODO: generalize for sparse tensor subscripts -// -static Value genAffine(CodeGen &codegen, OpBuilder &builder, AffineExpr a, - Location loc) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - return codegen.getLoopIdxValue(idx); // universal dense index - } - case AffineExprKind::Add: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(codegen, builder, binOp.getLHS(), loc), - genAffine(codegen, builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Mul: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(codegen, builder, binOp.getLHS(), loc), - genAffine(codegen, builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Constant: { - int64_t c = a.cast().getValue(); - return constantIndex(builder, loc, c); - } - default: - llvm_unreachable("unexpected affine subscript"); - } -} - /// Generates index for load/store on sparse tensor. static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) { auto map = op.getMatchingIndexingMap(t); @@ -533,16 +744,13 @@ unsigned rank = map.getNumResults(); if (enc) { // Note that currently, all sparse subscripts are simple. - // TODO: accept affine too? - assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() == - AffineExprKind::DimId); Value pidx = codegen.loopEmitter.getPidxs()[tensor].back(); assert(pidx); args.push_back(pidx); // position index } else { for (unsigned d = 0; d < rank; d++) { AffineExpr a = map.getResult(d); - args.push_back(genAffine(codegen, builder, a, op.getLoc())); + args.push_back(codegen.loopEmitter.genAffine(builder, a, op.getLoc())); } } return codegen.loopEmitter.getValBuffer()[tensor]; @@ -774,27 +982,6 @@ return ee; } -/// Determines if affine expression is invariant. -static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a, - unsigned ldx, bool &atLevel) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - if (idx == ldx) - atLevel = true; - return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play? - } - case AffineExprKind::Add: - case AffineExprKind::Mul: { - auto binOp = a.cast(); - return isInvariantAffine(codegen, binOp.getLHS(), ldx, atLevel) && - isInvariantAffine(codegen, binOp.getRHS(), ldx, atLevel); - } - default: - return true; - } -} - /// Hoists loop invariant tensor loads for which indices have been exhausted. static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned exp, unsigned ldx, @@ -809,7 +996,14 @@ auto enc = getSparseTensorEncoding(t.get().getType()); for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (!isInvariantAffine(codegen, a, ldx, atLevel)) + Optional sldx = merger.getLoopIdx(t.getOperandNumber(), d); + if (sldx && merger.isFilterLoop(sldx.value())) { + if (!codegen.getLoopIdxValue(sldx.value())) + // The filter loops has not been constructed. + return; + if (sldx.value() == ldx) + atLevel = true; + } else if (!isInvariantAffine(codegen, a, ldx, atLevel)) return; // still in play } // All exhausted at this level (atLevel denotes exactly at this level). 
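
The hunk below routes filter loops through enterFilterLoopOverTensorAtDim, which emits an scf.for over the stored entries of the compressed level wrapped in an scf.if that keeps only the entries whose coordinate equals the affine subscript. As an aside, that loop structure can be modeled in plain C++ as the sketch here; the SparseVec struct, the data, and the i + 2 subscript are made up purely for illustration and are not MLIR types or values from the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for one 1-D compressed level: indices[k] is the
// coordinate of the k-th stored entry, values[k] its value.
struct SparseVec {
  std::vector<int64_t> indices;
  std::vector<double> values;
};

int main() {
  // b has stored entries at coordinates 1, 3, 5 (made-up data).
  SparseVec b{{1, 3, 5}, {10.0, 30.0, 50.0}};
  std::vector<double> a(4, 2.0), x(4, 0.0);

  for (int64_t i = 0; i < 4; ++i) {                 // native loop on i
    int64_t expected = i + 2;                       // affine subscript i + 2
    for (size_t p = 0; p < b.indices.size(); ++p) { // filter loop (scf.for)
      if (b.indices[p] == expected)                 // coordinate check (scf.if)
        x[i] += a[i] * b.values[p];                 // body runs at most once per i
    }
  }
  for (double v : x)
    std::cout << v << ' ';                          // prints: 0 60 0 100
  std::cout << '\n';
  return 0;
}

The model also makes the patch's own TODOs concrete: a while-loop with an early break, or a binary search on an ordered level, would avoid scanning every stored entry.
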
@@ -928,13 +1122,22 @@ ArrayRef extraTids, ArrayRef extraDims) { Location loc = op.getLoc(); - auto iteratorTypes = op.getIteratorTypesArray(); bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) || isSingletonDLT(merger.getDimLevelType(tid, idx)); bool isParallel = isParallelFor(codegen, isOuter, isSparse); Operation *loop = genLoopBoundary(codegen, merger, [&](MutableArrayRef reduc) { + if (merger.isFilterLoop(idx)) { + assert(isSparse); + OpOperand *t = &op->getOpOperand(tid); + auto enc = getSparseTensorEncoding(t->get().getType()); + // Retrieves the affine expression for the filter loop. + AffineExpr a = + op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim)); + return codegen.loopEmitter.enterFilterLoopOverTensorAtDim( + builder, loc, tid, dim, a, reduc); + } return codegen.loopEmitter.enterLoopOverTensorAtDim( builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims); }).value(); @@ -1120,12 +1323,13 @@ return false; } -static void translateBitsToTidDimPairs(Merger &merger, CodeGen &codegen, - unsigned li, unsigned idx, - SmallVectorImpl &condTids, - SmallVectorImpl &condDims, - SmallVectorImpl &extraTids, - SmallVectorImpl &extraDims) { +static void translateBitsToTidDimPairs( + Merger &merger, CodeGen &codegen, linalg::GenericOp op, unsigned li, + unsigned idx, SmallVectorImpl &condTids, + SmallVectorImpl &condDims, SmallVectorImpl &extraTids, + SmallVectorImpl &extraDims, SmallVectorImpl &affineTids, + SmallVectorImpl &affineDims, SmallVectorImpl &exps) { + const BitVector &all = merger.lat(li).bits; const BitVector &simple = merger.lat(li).simple; @@ -1153,6 +1357,60 @@ // TODO: get rid of extraTids and extraDims. extraTids.push_back(tid); extraDims.push_back(dim.value()); + } else { + assert(isUndefDLT(dlt)); + if (tid >= op.getNumDpsInputs()) + // We only handle affine expression on input tensors (for now). + return; + OpOperand *operand = &op->getOpOperand(tid); + auto enc = getSparseTensorEncoding(operand->get().getType()); + if (!enc) + // Non-annotated dense tensors requires no special handling. + return; + + ArrayRef affines = + op.getMatchingIndexingMap(operand).getResults(); + assert(affines.size() == enc.getDimLevelType().size()); + for (unsigned i = 0, e = affines.size(); i < e; i++) { + AffineExpr exp = affines[toOrigDim(enc, i)]; + if (exp.isa() || !isDenseDLT(getDimLevelType(enc, i))) + // Skip simple affine expression and non dense dimensions (which has + // it own filter loop). + continue; + + // Constant affine expressions on dense level required to be generated + // when + // 1. The previous level is an (at-level) invariant compound dense + // affine (with no corresponding loop idx); or + // 2. The previous level is being generated right now. + if (exp.isa()) { + // TODO: Should we come up with a more adhersive way to handle + // constant expression? We now requires two (somehow ad-hoc) code for + // it. + if (i != 0 && // i == 0 cases are handled in genConstantDenseAddress + ((!affineTids.empty() && affineTids.back() == tid && + affineDims.back() == i - 1) || // Condition 1 + merger.getLoopIdx(tid, i - 1) == idx)) { // Condition 2 + affineTids.push_back(tid); + affineDims.push_back(i); + exps.push_back(exp); + } + } else { + bool atLevel = false; + if (isInvariantAffine(codegen, exp, idx, atLevel) && atLevel) { + // If the compound affine is invariant and we are right at the + // level. We need to generate the address according to the affine + // expression. 
This is also the best place we can do it to avoid + // putting it inside inner loops. + // NOTE: It assumes that the levels of the input tensor are + // initialized in order, another more admissible approach might be + // accepting out-of-order access between consecutive dense levels. + affineTids.push_back(tid); + affineDims.push_back(i); + exps.push_back(exp); + } + } + } } }); @@ -1160,7 +1418,6 @@ // Note that we generate dense indices of the output tensor // unconditionally, since they may not appear in the lattice, but may be // needed for linearized codegen. - // Only dense dimensions should be optimized from conditions. auto dim = merger.getDimNum(merger.getOutTensorID(), idx).value(); extraTids.push_back(merger.getOutTensorID()); extraDims.push_back(dim); @@ -1176,12 +1433,23 @@ // The set of (dense) tensors that is optimized from condition, yet still // need extra locals to iterate on them. SmallVector extraTids, extraDims; - - translateBitsToTidDimPairs(merger, codegen, li, codegen.topSort[at], condTids, - condDims, extraTids, extraDims); + // The set of dense tensors with non-trivial affine expression that just + // becomes invariant and the address shall now be generated at the current + // level. + SmallVector affineTids, affineDims; + SmallVector affines; + + translateBitsToTidDimPairs(merger, codegen, op, li, codegen.topSort[at], + condTids, condDims, extraTids, extraDims, + affineTids, affineDims, affines); // Emit the for/while-loop control. Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv, condTids, condDims, extraTids, extraDims); + + for (auto [tid, dim, exp] : llvm::zip(affineTids, affineDims, affines)) { + codegen.loopEmitter.genDenseAffineAddressAtCurLevel(builder, op.getLoc(), + tid, dim, exp); + } return loop; } @@ -1217,6 +1485,32 @@ genExpansion(merger, codegen, builder, op, at, /*atStart=*/false); } +static void genConstantDenseAddress(CodeGen &codegen, RewriterBase &rewriter, + linalg::GenericOp op) { + // We can generates address for constant affine expression before any loops + // starting from the first level as they do not depend on any thing. + // E.g., [Dense, Dense, Sparse] -> (1, 2, d0), the addresses for the first two + // levels can be determined before loops. + for (OpOperand *input : op.getDpsInputOperands()) { + ArrayRef affines = + op.getMatchingIndexingMap(input).getResults(); + auto enc = getSparseTensorEncoding(input->get().getType()); + if (enc) { + for (unsigned i = 0, e = affines.size(); i < e; i++) { + AffineExpr affine = affines[toOrigDim(enc, i)]; + if (isDenseDLT(getDimLevelType(enc, i)) && + affine.isa()) { + codegen.loopEmitter.genDenseAffineAddressAtCurLevel( + rewriter, op.getLoc(), input->getOperandNumber(), i, affine); + } else { + // Breaks on first non-dense non-constant level. + break; + } + } + } + } +} + /// Recursively generates code while computing iteration lattices in order /// to manage the complexity of implementing co-iteration over unions /// and intersections of sparse iterations spaces. @@ -1309,7 +1603,6 @@ //===----------------------------------------------------------------------===// namespace { - /// Sparse rewriting rule for generic Lingalg operation. 
struct GenericOpSparsifier : public OpRewritePattern { public: @@ -1324,7 +1617,8 @@ return failure(); unsigned numTensors = op->getNumOperands(); unsigned numLoops = op.getNumLoops(); - Merger merger(numTensors, numLoops); + unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op); + Merger merger(numTensors, numLoops, numFilterLoops); if (!findSparseAnnotations(merger, op)) return failure(); @@ -1338,9 +1632,9 @@ unsigned outerParNest = 0; // Computes a topologically sorted iteration graph to ensure tensors // are visited in natural index order. Gradually relaxes the considered - // constraints until an acyclic iteration graph results, such that sparse - // code generation can proceed. As a last resort, an attempt is made - // to resolve cycles by inserting a conversion. + // constraints until an acyclic iteration graph results, such that + // sparse code generation can proceed. As a last resort, an attempt is + // made to resolve cycles by inserting a conversion. std::vector topSort; // Whether the current GenericOp is admissible. bool isAdmissible = false; @@ -1377,6 +1671,7 @@ CodeGen codegen(options, tensors, numTensors, numLoops, sparseOut, outerParNest, topSort); genBuffers(merger, codegen, rewriter, op); + genConstantDenseAddress(codegen, rewriter, op); genStmt(merger, codegen, rewriter, op, exp, 0); genResult(merger, codegen, rewriter, op); return success(); @@ -1407,7 +1702,7 @@ auto srcTp = tval.getType().cast(); auto dstEnc = SparseTensorEncodingAttr::get( op->getContext(), srcEnc.getDimLevelType(), - permute(getContext(), op.getMatchingIndexingMap(t), + permute(merger, getContext(), op.getMatchingIndexingMap(t), topSort), // new order srcEnc.getHigherOrdering(), srcEnc.getPointerBitWidth(), srcEnc.getIndexBitWidth()); diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir @@ -2,6 +2,7 @@ // RUN: mlir-opt %s -sparsification | FileCheck %s #SpVec = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }> +#EncDenseVec = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }> #CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }> #trait1 = { @@ -54,6 +55,94 @@ return %0 : tensor<32xf32> } +// CHECK-LABEL: func.func @mul_inv_sparse1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to 
memref +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_6]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_4]] : index +// CHECK: %[[VAL_19:.*]] = scf.if %[[VAL_18]] -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_23:.*]] = scf.for %[[VAL_24:.*]] = %[[VAL_21]] to %[[VAL_22]] step %[[VAL_3]] iter_args(%[[VAL_25:.*]] = %[[VAL_16]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref +// CHECK: %[[VAL_27:.*]] = arith.mulf %[[VAL_26]], %[[VAL_20]] : f32 +// CHECK: %[[VAL_28:.*]] = arith.addf %[[VAL_27]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_29:.*]] = sparse_tensor.insert %[[VAL_28]] into %[[VAL_25]]{{\[}}%[[VAL_17]]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield %[[VAL_29]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_30:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_16]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_31:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_32:.*]] = sparse_tensor.load %[[VAL_33:.*]] hasInserts : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_32]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>, + %argb: tensor<4xf32, #SpVec>) -> tensor<32xf32, #SpVec> { + %argx = bufferization.alloc_tensor() : tensor<32xf32, #SpVec> + %0 = linalg.generic #trait1 + ins(%arga, %argb: tensor<32xf32, #SpVec>, tensor<4xf32, #SpVec>) + outs(%argx: tensor<32xf32, #SpVec>) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = arith.mulf %a, %b : f32 + %1 = arith.addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor<32xf32, #SpVec> + return %0 : tensor<32xf32, #SpVec> +} + + +// CHECK-LABEL: func.func @mul_inv_enc_dense1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 32 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_6]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref 
+// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_5]] { +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_11]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref +// CHECK: %[[VAL_14:.*]] = arith.mulf %[[VAL_13]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_15:.*]] = arith.addf %[[VAL_12]], %[[VAL_14]] : f32 +// CHECK: memref.store %[[VAL_15]], %[[VAL_9]]{{\[}}%[[VAL_11]]] : memref +// CHECK: } +// CHECK: %[[VAL_16:.*]] = sparse_tensor.load %[[VAL_6]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_16]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>, + %argb: tensor<4xf32, #EncDenseVec>) -> tensor<32xf32, #EncDenseVec> { + %argx = bufferization.alloc_tensor() : tensor<32xf32, #EncDenseVec> + %0 = linalg.generic #trait1 + ins(%arga, %argb: tensor<32xf32, #EncDenseVec>, tensor<4xf32, #EncDenseVec>) + outs(%argx: tensor<32xf32, #EncDenseVec>) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = arith.mulf %a, %b : f32 + %1 = arith.addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor<32xf32, #EncDenseVec> + return %0 : tensor<32xf32, #EncDenseVec> +} + #trait2 = { indexing_maps = [ affine_map<(i) -> (i)>, // a @@ -104,6 +193,58 @@ return %0 : tensor<32xi32> } +// CHECK-LABEL: func.func @and_affine_sparse1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_5]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_19]] to %[[VAL_20]] step %[[VAL_3]] iter_args(%[[VAL_23:.*]] = %[[VAL_16]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_24:.*]] = memref.load 
%[[VAL_10]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_17]], %[[VAL_4]] : index +// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_24]], %[[VAL_25]] : index +// CHECK: %[[VAL_27:.*]] = scf.if %[[VAL_26]] -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_29:.*]] = arith.andi %[[VAL_18]], %[[VAL_28]] : i32 +// CHECK: %[[VAL_30:.*]] = sparse_tensor.insert %[[VAL_29]] into %[[VAL_23]]{{\[}}%[[VAL_17]]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield %[[VAL_30]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_23]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_31:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_32:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_33:.*]] = sparse_tensor.load %[[VAL_34:.*]] hasInserts : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_33]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +func.func @and_affine_sparse1d(%arga: tensor<32xi32, #SpVec>, + %argb: tensor<34xi32, #SpVec>) -> tensor<32xi32, #SpVec> { + %argx = bufferization.alloc_tensor() : tensor<32xi32, #SpVec> + %0 = linalg.generic #trait2 + ins(%arga, %argb: tensor<32xi32, #SpVec>, tensor<34xi32, #SpVec>) + outs(%argx: tensor<32xi32, #SpVec>) { + ^bb(%a: i32, %b: i32, %x: i32): + %0 = arith.andi %a, %b : i32 + linalg.yield %0 : i32 + } -> tensor<32xi32, #SpVec> + return %0 : tensor<32xi32, #SpVec> +} + + #trait3 = { indexing_maps = [ affine_map<(i,j) -> (i,j)>, // a @@ -160,3 +301,66 @@ } -> tensor<32x16xf64> return %0 : tensor<32x16xf64> } + +// CHECK-LABEL: func.func @mul_affine_sparse2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 32 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK: %[[VAL_7:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_17:.*]] = %[[VAL_8]]) -> (tensor<32x16xf64, 
#sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index +// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_16]], %[[VAL_4]] : index +// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_20]]] : memref +// CHECK: %[[VAL_22:.*]] = scf.for %[[VAL_23:.*]] = %[[VAL_19]] to %[[VAL_21]] step %[[VAL_4]] iter_args(%[[VAL_24:.*]] = %[[VAL_17]]) -> (tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_23]]] : memref +// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref +// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_18]]] : memref +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_4]] : index +// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref +// CHECK: %[[VAL_30:.*]]:2 = scf.for %[[VAL_31:.*]] = %[[VAL_27]] to %[[VAL_29]] step %[[VAL_4]] iter_args(%[[VAL_32:.*]] = %[[VAL_6]], %[[VAL_33:.*]] = %[[VAL_24]]) -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_31]]] : memref +// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_25]], %[[VAL_7]] : index +// CHECK: %[[VAL_36:.*]] = arith.cmpi eq, %[[VAL_34]], %[[VAL_35]] : index +// CHECK: %[[VAL_37:.*]]:2 = scf.if %[[VAL_36]] -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref +// CHECK: %[[VAL_39:.*]] = arith.mulf %[[VAL_26]], %[[VAL_38]] : f64 +// CHECK: %[[VAL_40:.*]] = arith.addf %[[VAL_32]], %[[VAL_39]] : f64 +// CHECK: scf.yield %[[VAL_40]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_32]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_41:.*]]#0, %[[VAL_41]]#1 : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_42:.*]] = sparse_tensor.insert %[[VAL_43:.*]]#0 into %[[VAL_43]]#1{{\[}}%[[VAL_16]], %[[VAL_25]]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield %[[VAL_42]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_44:.*]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = sparse_tensor.load %[[VAL_46:.*]] hasInserts : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_45]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +func.func @mul_affine_sparse2d(%arga: tensor<32x16xf64, #CSR>, + %argb: tensor<34x19xf64, #CSR>) -> tensor<32x16xf64, #CSR> { + %argx = bufferization.alloc_tensor() : tensor<32x16xf64, #CSR> + %0 = linalg.generic #trait3 + ins(%arga, %argb: tensor<32x16xf64, #CSR>, tensor<34x19xf64, #CSR>) + outs(%argx: tensor<32x16xf64, #CSR>) { + ^bb(%a: f64, %b: f64, %x: f64): + %0 = arith.mulf %a, %b : f64 + %1 = arith.addf %x, %0 : f64 + linalg.yield %1 : f64 + } -> tensor<32x16xf64, #CSR> + return %0 : tensor<32x16xf64, #CSR> +} + diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir @@ -0,0 +1,115 @@ +// RUN: mlir-opt %s 
--sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed" ] }> + +#CDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed" ] + // FIXME: Still inadmissible might need investigation + // dimOrdering = affine_map<(i,j,k) -> (j,k,i)> +}> + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf_CCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor + %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor + %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %in1D_tmp = call @alloc_3d_filled_f32(%c3, %c8, %c1, %val) : (index, index, index, f32) -> (tensor) + %in1D_nwc = tensor.insert %f10 into %in1D_tmp[%c0, %c3, %c0] : tensor + %filter1D_nwc = call @alloc_3d_filled_f32(%c3, %c1, %c1, %val) : (index, index, index, f32) -> (tensor) + %out1D_nwc = call @alloc_3d_filled_f32(%c3, %c6, %c1, %zero) : (index, index, index, f32) -> (tensor) + + %in1D_nwc_CCC = sparse_tensor.convert %in1D_nwc + : tensor to tensor + %filter1D_nwc_CCC = sparse_tensor.convert %filter1D_nwc + : tensor to tensor + + %in1D_nwc_CDC = sparse_tensor.convert %in1D_nwc + : tensor to tensor + %filter1D_nwc_CDC = sparse_tensor.convert %filter1D_nwc + : tensor to tensor + + %dense_ret = call @conv_1d_nwc_wcf(%in1D_nwc, %filter1D_nwc, %out1D_nwc) : (tensor, tensor, tensor) -> (tensor) + %CCC_ret = call @conv_1d_nwc_wcf_CCC(%in1D_nwc_CCC, %filter1D_nwc_CCC) : (tensor, tensor) -> (tensor) + %CDC_ret = call 
@conv_1d_nwc_wcf_CDC(%in1D_nwc_CDC, %filter1D_nwc_CDC) : (tensor, tensor) -> (tensor) + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %dense_v : vector<3x6x1xf32> + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %1 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %v1 : vector<3x6x1xf32> + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %2 = sparse_tensor.convert %CDC_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %v2 : vector<3x6x1xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir @@ -0,0 +1,209 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> +#CSR = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}> +#CSC = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed" ], + dimOrdering = affine_map<(i,j) -> (j,i)> +}> + +// An example of a 2D convolution with a sparse filter. 
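+// For reference (a sketch only, not part of the kernel under test):
+// linalg.conv_2d computes O(i, j) += I(i + di, j + dj) * K(di, dj), i.e. it
+// corresponds to a linalg.generic whose indexing maps look roughly like
+//
+//   affine_map<(i, j, di, dj) -> (i + di, j + dj)>   // input
+//   affine_map<(i, j, di, dj) -> (di, dj)>           // filter
+//   affine_map<(i, j, di, dj) -> (i, j)>             // output
+//
+// The i + di and j + dj expressions on a *sparse* input are the affine index
+// expressions that the sparsifier's new filter loops are meant to resolve,
+// which is what the all-sparse variants below exercise.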
+module { + + func.func @conv2d(%input: tensor<8x8xi32>, + %filter: tensor<3x3xi32, #DCSR>, + %output: tensor<6x6xi32>) -> tensor<6x6xi32> { + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32> + return %0 : tensor<6x6xi32> + } + + func.func @conv2d_sparse_out(%input: tensor<8x8xi32>, + %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + return %0 : tensor<6x6xi32, #DCSR> + } + + func.func @conv2d_all_sparse_DCSR(%input: tensor<8x8xi32, #DCSR>, + %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>) + outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + return %0 : tensor<6x6xi32, #DCSR> + } + + func.func @conv2d_all_sparse_CSR(%input: tensor<8x8xi32, #CSR>, + %filter: tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>) + outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR> + return %0 : tensor<6x6xi32, #CSR> + } + + func.func @conv2d_all_sparse_CSC(%input: tensor<8x8xi32, #CSC>, + %filter: tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>) + outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC> + return %0 : tensor<6x6xi32, #CSC> + } + + func.func @entry() { + %c0 = arith.constant 0 : index + %i0 = arith.constant 0 : i32 + + // A typical edge detection filter. + %filter = arith.constant dense<[ + [ 1, 0, -1 ], + [ 0, 0, 0 ], + [ -1, 0, 1 ] + ]> : tensor<3x3xi32> + %sparse_filter_DCSR = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #DCSR> + %sparse_filter_CSR = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #CSR> + %sparse_filter_CSC = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #CSC> + + + %input = arith.constant dense<[ + [ 1, 2, 3, 4, 0, 6, 7, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 0, 0, 6, 8 ], + [ 3, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 3, 6, 6, 8 ], + [ 1, 3, 3, 4, 3, 0, 7, 8 ] + ]> : tensor<8x8xi32> + %sparse_input_DCSR = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #DCSR> + %sparse_input_CSR = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #CSR> + %sparse_input_CSC = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #CSC> + + // Call the kernel. 
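+    // Five variants are compared: the dense reference (@conv2d), a sparse
+    // output with a dense input (@conv2d_sparse_out), and three all-sparse
+    // versions over DCSR, CSR, and CSC operands. All are expected to produce
+    // the same 6x6 result.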
+ %output = arith.constant dense<0> : tensor<6x6xi32> + %0 = call @conv2d(%input, %sparse_filter_DCSR, %output) + : (tensor<8x8xi32>, + tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32> + %1 = call @conv2d_sparse_out(%input, %sparse_filter_DCSR) + : (tensor<8x8xi32>, + tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %sparse_filter_DCSR) + : (tensor<8x8xi32, #DCSR>, + tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR) + : (tensor<8x8xi32, #CSR>, + tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> + %4 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %sparse_filter_CSC) + : (tensor<8x8xi32, #CSC>, + tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> + + + // Verify the output. + // + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %v = vector.transfer_read %0[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %sparse_ret = sparse_tensor.convert %1 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v1 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_DCSR = sparse_tensor.convert %2 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v2 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_CSR = sparse_tensor.convert %3 + : tensor<6x6xi32, #CSR> to tensor<6x6xi32> + %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v3 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_CSC = sparse_tensor.convert %4 + : tensor<6x6xi32, #CSC> to tensor<6x6xi32> + %v4 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v4 : vector<6x6xi32> + + // Release the resources. 
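+    // Free the three sparse filters, the three sparse inputs, and the four
+    // sparse results computed above.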
+ bufferization.dealloc_tensor %sparse_filter_DCSR : tensor<3x3xi32, #DCSR> + bufferization.dealloc_tensor %sparse_filter_CSR : tensor<3x3xi32, #CSR> + bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC> + + bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR> + bufferization.dealloc_tensor %sparse_input_CSR : tensor<8x8xi32, #CSR> + bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC> + + bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR> + bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR> + bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR> + bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CSC> + return + } +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir @@ -0,0 +1,158 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed", "compressed" ] +}> + +#CDCD = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed", "dense" ] +}> + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf_CDCD(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = 
arith.constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) :(index, index, index, index, f32) -> (tensor) + %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (tensor) + %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c3, %c0] : tensor + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, index, f32) -> (tensor) + + %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %filter2D_nhwc_CCCC = sparse_tensor.convert %filter2D_nhwc + : tensor to tensor + + %in2D_nhwc_CDCD = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %filter2D_nhwc_CDCD = sparse_tensor.convert %filter2D_nhwc + : tensor to tensor + + %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor, tensor, tensor) -> (tensor) + %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc_CCCC) : (tensor, tensor) -> (tensor) + %CDCD_ret = call @conv_2d_nhwc_hwcf_CDCD(%in2D_nhwc_CDCD, %filter2D_nhwc_CDCD) : (tensor, tensor) -> (tensor) + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %dense_v : vector<3x6x6x1xf32> + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // 
CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %1 = sparse_tensor.convert %CCCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %v1 : vector<3x6x6x1xf32> + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %2 = sparse_tensor.convert %CDCD_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %v2 : vector<3x6x6x1xf32> + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir @@ -0,0 +1,206 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed" ] +}> + +#CDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed" ] +}> + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d(%arg0: tensor, 
%arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_CDC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (tensor) + %in3D_tmp = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (tensor) + %in3D = tensor.insert %f10 into %in3D_tmp[%c0, %c3, %c0] : tensor + %out3D = call @alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (tensor) + + %in3D_CCC = sparse_tensor.convert %in3D + : tensor to tensor + %filter3D_CCC = sparse_tensor.convert %filter3D + : tensor to tensor + + %in3D_CDC = sparse_tensor.convert %in3D + : tensor to tensor + %filter3D_CDC = sparse_tensor.convert %filter3D + : tensor to tensor + + %dense_ret = call @conv_3d(%in3D, %filter3D, %out3D) : (tensor, tensor, tensor) -> (tensor) + %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D_CCC) : (tensor, tensor) -> (tensor) + %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D_CDC) : (tensor, tensor) -> (tensor) + + // CHECK:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // 
CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %dense_v : vector<6x6x6xf32> + + // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %1 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v1 : vector<6x6x6xf32> + + // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 
108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %2 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v2 : vector<6x6x6xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir @@ -0,0 +1,226 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCCCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed", "compressed", "compressed" ] +}> + +#CDCDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed", "dense", "compressed"] +}> + +// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f +func.func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4, %s5) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf(%arg0: tensor, + %arg1: tensor, + %arg2: tensor) -> tensor { + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf_CCCCC(%arg0: tensor, + %arg1: tensor) + -> tensor { + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c1, %c6, %c6, %c6, %c1) + : tensor + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf_CDCDC(%arg0: tensor, + %arg1: tensor) + -> tensor { + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c1, %c6, %c6, %c6, %c1) + : tensor + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + 
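+// Note (descriptive only): #CCCCC stores every level of the 5-D operands as a
+// compressed level, while #CDCDC alternates compressed and dense levels, so
+// the loops generated for the CDCDC variant mix sparse iteration over
+// pointer/index arrays with plain dense loops over the full dimension size.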
+func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %in3D_tmp = call @alloc_5d_filled_f32(%c1, %c8, %c8, %c8, %c1, %val) : (index, index, index, index, index, f32) -> (tensor) + %in3D_ndhwc = tensor.insert %f10 into %in3D_tmp[%c0, %c0, %c0, %c3, %c0] : tensor + + %filter3D_ndhwc = call @alloc_5d_filled_f32(%c3, %c3, %c3, %c1, %c1, %val) : (index, index, index, index, index, f32) -> (tensor) + %out3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c6, %c6, %c6, %c1, %zero) : (index, index, index, index, index, f32) -> (tensor) + + %in3D_ndhwc_CCCCC = sparse_tensor.convert %in3D_ndhwc + : tensor to tensor + %filter3D_ndhwc_CCCCC = sparse_tensor.convert %filter3D_ndhwc + : tensor to tensor + + %in3D_ndhwc_CDCDC = sparse_tensor.convert %in3D_ndhwc + : tensor to tensor + %filter3D_ndhwc_CDCDC = sparse_tensor.convert %filter3D_ndhwc + : tensor to tensor + + // CHECK:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // 
CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %dense_ret = call @conv_3d_ndhwc_dhwcf(%in3D_ndhwc, %filter3D_ndhwc, %out3D_ndhwc) + : (tensor, tensor, tensor) -> (tensor) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %dense_v : vector<1x6x6x6x1xf32> + + %CCCCC_ret = call @conv_3d_ndhwc_dhwcf_CCCCC(%in3D_ndhwc_CCCCC, %filter3D_ndhwc_CCCCC) + : (tensor, + tensor) -> (tensor) + + // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %1 = sparse_tensor.convert %CCCCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0, 
%c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %v1 : vector<1x6x6x6x1xf32> + + %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc_CDCDC) + : (tensor, + tensor) -> (tensor) + + // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %2 = sparse_tensor.convert %CDCDC_ret + : tensor to tensor + %v2 = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %v2 : vector<1x6x6x6x1xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir deleted file mode 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir +++ /dev/null @@ -1,95 +0,0 @@ -// RUN: mlir-opt %s --sparse-compiler | \ -// RUN: mlir-cpu-runner -e entry 
-entry-point-result=void \ -// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ -// RUN: FileCheck %s - -#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> - -// An example of a 2D convolution with a sparse filter. -module { - - func.func @conv2d(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>, - %output: tensor<6x6xi32>) -> tensor<6x6xi32> { - %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) - outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32> - return %0 : tensor<6x6xi32> - } - - func.func @conv2d_sparse_out(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { - %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> - %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) - outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - return %0 : tensor<6x6xi32, #DCSR> - } - - func.func @entry() { - %c0 = arith.constant 0 : index - %i0 = arith.constant 0 : i32 - - // A typical edge detection filter. - %filter = arith.constant dense<[ - [ 1, 0, -1 ], - [ 0, 0, 0 ], - [ -1, 0, 1 ] - ]> : tensor<3x3xi32> - %sparse_filter = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32, #DCSR> - - %input = arith.constant dense<[ - [ 1, 2, 3, 4, 0, 6, 7, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 0, 0, 6, 8 ], - [ 3, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 3, 6, 6, 8 ], - [ 1, 3, 3, 4, 3, 0, 7, 8 ] - ]> : tensor<8x8xi32> - - // Call the kernel. - %output = arith.constant dense<0> : tensor<6x6xi32> - %0 = call @conv2d(%input, %sparse_filter, %output) - : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32> - %1 = call @conv2d_sparse_out(%input, %sparse_filter) - : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - - // Verify the output. - // - // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), - // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) - // - %v = vector.transfer_read %0[%c0, %c0], %i0 - : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v : vector<6x6xi32> - - // - // Should be the same as dense output - // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), - // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) - // - %sparse_ret = sparse_tensor.convert %1 - : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> - %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0 - : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v1 : vector<6x6xi32> - - // Release the resources. - bufferization.dealloc_tensor %sparse_filter : tensor<3x3xi32, #DCSR> - bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR> - return - } -} diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp --- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp +++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp @@ -127,7 +127,7 @@ protected: MergerTestBase(unsigned numTensors, unsigned numLoops) : numTensors(numTensors), numLoops(numLoops), - merger(numTensors, numLoops) {} + merger(numTensors, numLoops, /*numFilterLoops=*/0) {} /// /// Expression construction helpers.