diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -153,11 +153,15 @@
   /// last tensor in this set denoting the output tensor. The merger adds an
   /// additional synthetic tensor at the end of this set to represent all
   /// invariant expressions in the kernel.
-  Merger(unsigned t, unsigned l)
-      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l),
-        hasSparseOut(false),
-        dimTypes(t + 1, std::vector<DimLevelType>(l, DimLevelType::Undef)),
-        loopIdxToDim(t + 1, std::vector<Optional<unsigned>>(l, llvm::None)) {}
+  Merger(unsigned t, unsigned l, unsigned fl)
+      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1),
+        numNativeLoops(l), numLoops(l + fl), hasSparseOut(false),
+        dimTypes(numTensors,
+                 std::vector<DimLevelType>(numLoops, DimLevelType::Undef)),
+        loopIdxToDim(numTensors,
+                     std::vector<Optional<unsigned>>(numLoops, llvm::None)),
+        dimToLoopIdx(numTensors,
+                     std::vector<Optional<unsigned>>(numLoops, llvm::None)) {}
 
   /// Adds a tensor expression. Returns its index.
   unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value(),
@@ -227,6 +231,13 @@
   /// Bit translation (get loop index).
   unsigned index(unsigned b) const { return b / numTensors; }
 
+  /// Get the number of total loops (native loops + filter loops).
+  unsigned getNumLoops() const { return numLoops; }
+  /// Get the number of native loops.
+  unsigned getNumNativeLoops() const { return numNativeLoops; }
+  /// Get the starting filter loop index.
+  unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); }
+
   /// Returns true if bit corresponds to index of output tensor.
   bool isOutTensor(unsigned b, unsigned i) const {
     return tensor(b) == outTensor && index(b) == i;
@@ -238,6 +249,11 @@
   /// expressions).
   unsigned getSynTensorID() const { return syntheticTensor; }
 
+  bool isFilterLoop(unsigned ldx) const {
+    assert(ldx < numLoops);
+    return ldx >= numNativeLoops;
+  }
+
   /// Returns true if given tensor iterates *only* in the given tensor
   /// expression. For the output tensor, this defines a "simply dynamic"
   /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for
@@ -258,6 +274,11 @@
     return getDimLevelType(tensor(b), index(b));
   }
 
+  Optional<unsigned> getLoopIdx(unsigned t, unsigned dim) const {
+    assert(t < numTensors && dim < numLoops);
+    return dimToLoopIdx[t][dim];
+  }
+
   /// Gets the dimension number of the the `t`th tensor on `i`th loop.
   Optional<unsigned> getDimNum(unsigned t, unsigned i) const {
     assert(t < numTensors && i < numLoops);
@@ -276,6 +297,8 @@
     assert(isValidDLT(dlt));
     dimTypes[t][i] = dlt;
     loopIdxToDim[t][i] = dim;
+    assert(dim < numLoops);
+    dimToLoopIdx[t][dim] = i;
   }
 
   // Iterates the bits of a lattice, for each set bit, converts it into the
@@ -334,6 +357,7 @@
   const unsigned outTensor;
   const unsigned syntheticTensor;
   const unsigned numTensors;
+  const unsigned numNativeLoops;
   const unsigned numLoops;
   bool hasSparseOut;
   // Map that converts pair<tensor id, loop id> to the corresponding dimension
@@ -341,6 +365,8 @@
   std::vector<std::vector<DimLevelType>> dimTypes;
   // Map that converts pair<tensor id, loop id> to the corresponding dimension.
   std::vector<std::vector<Optional<unsigned>>> loopIdxToDim;
+  // Map that converts pair<tensor id, dim> to the corresponding loop id.
+  std::vector<std::vector<Optional<unsigned>>> dimToLoopIdx;
   llvm::SmallVector<TensorExp, 32> tensorExps;
   llvm::SmallVector<LatPoint, 16> latPoints;
   llvm::SmallVector<SmallVector<unsigned, 16>, 8> latSets;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -329,7 +329,8 @@
   /// If isSparseOut is set, loop emitter assume that the sparse output tensor
   /// is empty, and will always generate loops on it based on the dim sizes.
   explicit SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput = false,
-                                   bool isSparseOut = false);
+                                   bool isSparseOut = false,
+                                   ArrayRef<unsigned> topSort = {});
 
   /// Starts a loop emitting session by generating all the buffers needed to
   /// iterate tensors.
@@ -374,6 +375,15 @@
                                       ArrayRef<size_t> extraTids = {},
                                       ArrayRef<size_t> extraDims = {});
 
+  Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+                                            size_t tid, size_t dim,
+                                            AffineExpr affine,
+                                            MutableArrayRef<Value> reduc = {});
+
+  void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc,
+                                       size_t tid, size_t dim,
+                                       AffineExpr affine);
+
   /// Emits a co-iteration loop over a set of tensors.
   Operation *enterCoIterationOverTensorsAtDims(
       OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
@@ -424,6 +434,8 @@
     const Value iv;        // the induction variable for the loop
   };
 
+  Value genAffine(OpBuilder &builder, AffineExpr a, Location loc);
+
   /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
   Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim,
                    Value iv) {
@@ -506,6 +518,8 @@
   // sequence.
   std::vector<Value> loopSeqStack;
 
+  std::vector<unsigned> sparsiferLoopLvlMap;
+
   // TODO: not yet used, it should track the current level for each tensor
   // to help eliminate `dim` paramters from above APIs.
   // std::vector<size_t> curLv;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -96,12 +96,14 @@
 
 SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors,
                                                  bool hasOutput,
-                                                 bool isSparseOut)
+                                                 bool isSparseOut,
+                                                 ArrayRef<unsigned> topSort)
     : hasOutput(hasOutput), isSparseOut(isSparseOut),
       tensors(tensors.begin(), tensors.end()), dimTypes(tensors.size()),
       pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()),
       ptrBuffer(tensors.size()), idxBuffer(tensors.size()),
-      valBuffer(tensors.size()), loopStack() {
+      valBuffer(tensors.size()), loopStack(),
+      sparsiferLoopLvlMap(topSort.size(), 0) {
   for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
     auto t = tensors[tid];
     // a scalar or 0-dimension tensors
@@ -125,6 +127,13 @@
     ptrBuffer[tid].assign(rank, Value());
     idxBuffer[tid].assign(rank, Value());
   }
+
+  for (unsigned i = 0, e = topSort.size(); i < e; i++) {
+    // This is an inverse map of the topologically sorted loop index from
+    // sparsifier. This is needed to map the AffineDimExpr back to the loopStack
+    // index used in loop emitter.
+    sparsiferLoopLvlMap[topSort[i]] = i;
+  }
 }
 
 void SparseTensorLoopEmitter::initializeLoopEmit(
@@ -291,6 +300,106 @@
   return loop;
 }
 
+Value SparseTensorLoopEmitter::genAffine(OpBuilder &builder, AffineExpr a,
+                                         Location loc) {
+  switch (a.getKind()) {
+  case AffineExprKind::DimId: {
+    unsigned idx = a.cast<AffineDimExpr>().getPosition();
+    return loopStack[sparsiferLoopLvlMap[idx]].iv; // universal dense index
+  }
+  case AffineExprKind::Add: {
+    auto binOp = a.cast<AffineBinaryOpExpr>();
+    return builder.create<arith::AddIOp>(
+        loc, genAffine(builder, binOp.getLHS(), loc),
+        genAffine(builder, binOp.getRHS(), loc));
+  }
+  case AffineExprKind::Mul: {
+    auto binOp = a.cast<AffineBinaryOpExpr>();
+    return builder.create<arith::MulIOp>(
+        loc, genAffine(builder, binOp.getLHS(), loc),
+        genAffine(builder, binOp.getRHS(), loc));
+  }
+  case AffineExprKind::Constant: {
+    int64_t c = a.cast<AffineConstantExpr>().getValue();
+    return constantIndex(builder, loc, c);
+  }
+  default:
+    llvm_unreachable("unexpected affine subscript");
+  }
+}
+
+Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim(
+    OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
+    MutableArrayRef<Value> reduc) {
+  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(dimTypes[tid][dim]));
+  assert(dimTypes[tid].size() > dim);
+  // We can not re-enter the same level.
+  assert(!coord[tid][dim]);
+
+  Value step = constantIndex(builder, loc, 1);
+
+  Value lo = pidxs[tid][dim];
+  Value hi = highs[tid][dim];
+
+  // TODO: we should instead use a whileOp for filter loop to allow early
+  // break when exceeding (for ordered dimensions).
+  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+
+  // In-place update on the reduction variable vector.
+  assert(forOp.getNumRegionIterArgs() == reduc.size());
+  for (int i = 0, e = reduc.size(); i < e; i++)
+    reduc[i] = forOp.getRegionIterArg(i);
+
+  builder.setInsertionPointToStart(forOp.getBody());
+  Value iv = forOp.getInductionVar();
+
+  pidxs[tid][dim] = iv;
+  // Generating a load on the indices array yields the coordinate.
+  Value ptr = idxBuffer[tid][dim];
+  coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
+
+  // Generate a if condition to filter out indices that is not equal to the
+  // result of the affine expression.
+  Value expected = genAffine(builder, affine, loc);
+  auto pred = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
+                                            coord[tid][dim], expected);
+  SmallVector<Type> types;
+  for (Value red : reduc) {
+    types.push_back(red.getType());
+  }
+
+  bool hasReduc = !types.empty();
+  scf::IfOp ifOp =
+      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
+  if (hasReduc) {
+    // scf.for (a) -> v
+    //  %s = scf.if (a) -> v
+    //    user-generated code.
+    //  else
+    //    yield a
+    //  yield %s
+    builder.create<scf::YieldOp>(loc, ifOp.getResults());
+    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+    // On mismatch.
+    builder.create<scf::YieldOp>(loc, reduc);
+  }
+  // Set the insert point to matched branch.
+  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+  // NOTE: we can also prepares for next dim here in advance
+  // Push the loop into stack
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+                         coord[tid][dim]);
+  return forOp;
+}
+
+void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel(
+    OpBuilder &builder, Location loc, size_t tid, size_t dim,
+    AffineExpr affine) {
+  Value affineV = genAffine(builder, affine, loc);
+  pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV);
+}
+
 Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
     OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
     ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc,
@@ -469,7 +578,6 @@
   if (forOp) {
     if (!reduc.empty()) {
       assert(reduc.size() == forOp.getNumResults());
-      rewriter.setInsertionPointToEnd(forOp.getBody());
       rewriter.create<scf::YieldOp>(loc, reduc);
     }
     // Exit the loop.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -57,7 +57,7 @@
           unsigned numLoops, OpOperand *op, unsigned nest,
           std::vector<unsigned> &ts)
       : options(o), loopEmitter(tensors, /*isLastOutput=*/true,
-                                /*isSparseOut=*/op != nullptr),
+                                /*isSparseOut=*/op != nullptr, ts),
         sparseOut(op), outerParNest(nest), topSort(ts) {
     if (op)
       insChain = op->get();
@@ -105,7 +105,7 @@
 static AffineMap permute(MLIRContext *context, AffineMap m,
                          std::vector<unsigned> &topSort) {
   unsigned sz = topSort.size();
-  assert(m.getNumResults() == sz && "TopoSort/AffineMap size mismatch");
+  assert(m.getNumDims() == sz && "TopoSort/AffineMap size mismatch");
   // Construct the inverse of `m`; to avoid the asymptotic complexity
   // of calling `m.getPermutedPosition` repeatedly.
   SmallVector<unsigned, 4> inv(sz);
@@ -122,7 +122,7 @@
 /// same index is used more than once. Also rejects compound affine
 /// expressions in sparse dimensions.
 static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
-                       AffineExpr a, DimLevelType dlt,
+                       AffineExpr a, DimLevelType dlt, unsigned &filterLdx,
                        bool setLvlFormat = true) {
   switch (a.getKind()) {
   case AffineExprKind::DimId: {
@@ -135,22 +135,58 @@
     return true;
   }
   case AffineExprKind::Add:
-  case AffineExprKind::Mul: {
-    if (!isDenseDLT(dlt))
-      return false; // compound only in dense dim
-    auto binOp = a.cast<AffineBinaryOpExpr>();
-    // We do not set dim level format for affine expresssion like d0 + d1 on
-    // both loop index at d0 and d1,
-    return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) &&
-           findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false);
+  case AffineExprKind::Mul:
+  case AffineExprKind::Constant: {
+    if (!isDenseDLT(dlt) && setLvlFormat) {
+      assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx)));
+      // Use a filter loop for sparse affine expression.
+      merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt);
+    }
+
+    if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
+      // We do not set dim level format for affine expresssion like d0 + d1 on
+      // either loop index at d0 or d1/
+      // We continues the recursion merely to check whether current affine is
+      // admissible or not.
+      return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx,
+                        false) &&
+             findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx,
+                        false);
+    }
+    // Falls through when it is a constant Affine
+    return true;
   }
-  case AffineExprKind::Constant:
-    return isDenseDLT(dlt); // const only in dense dim
   default:
     return false;
   }
 }
 
+static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap,
+                                                 Value tensor) {
+  unsigned num = 0;
+  auto enc = getSparseTensorEncoding(tensor.getType());
+  if (enc) {
+    ArrayRef<AffineExpr> exps = affineMap.getResults();
+    for (unsigned rank = 0; rank < exps.size(); rank++) {
+      auto aidx = toOrigDim(enc, rank);
+      auto affine = exps[aidx];
+      if (!affine.isa<AffineDimExpr>())
+        if (!isDenseDLT(getDimLevelType(enc, rank)))
+          num++;
+    }
+  }
+
+  return num;
+}
+
+static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) {
+  unsigned num = 0;
+  for (OpOperand &t : op->getOpOperands())
+    num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t),
+                                            t.get());
+  return num;
+}
+
 /// Helper method to inspect sparse encodings in the tensor types.
 /// Fills the per-dimension sparsity information for all tensors.
 /// Returns true if the sparse annotations and affine subscript
@@ -158,19 +194,22 @@
 /// no annotations are found or inadmissible constructs occur.
 static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
   bool annotated = false;
+  unsigned filterLdx = merger.getFilterLoopStartingIdx();
   for (OpOperand &t : op->getOpOperands()) {
     auto map = op.getMatchingIndexingMap(&t);
     auto enc = getSparseTensorEncoding(t.get().getType());
     if (enc)
       annotated = true;
     assert(map.getNumResults() == op.getRank(&t));
+
     for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
       unsigned tensor = t.getOperandNumber();
       AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d)))
+      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx))
         return false; // inadmissible affine expression
     }
   }
+  assert(filterLdx == merger.getNumLoops());
   return annotated;
 }
 
@@ -179,14 +218,15 @@
 /// The sorted result will put the first Reduction iterator to the
 /// latest possible index.
 static bool topSortOptimal(unsigned n, ArrayRef<StringRef> iteratorTypes,
-                           std::vector<unsigned> &topSort,
+                           const Merger &merger, std::vector<unsigned> &topSort,
                            std::vector<unsigned> &inDegree,
                            std::vector<std::vector<bool>> &adjM) {
   std::vector<unsigned> redIt; // reduce iterator with 0 degree
   std::vector<unsigned> parIt; // parallel iterator with 0 degree
   for (unsigned i = 0; i < n; i++) {
     if (inDegree[i] == 0) {
-      if (linalg::isReductionIterator(iteratorTypes[i]))
+      if (!merger.isFilterLoop(i) &&
+          linalg::isReductionIterator(iteratorTypes[i]))
         redIt.push_back(i);
       else
         parIt.push_back(i);
@@ -202,7 +242,8 @@
     // Update in-degree, and push 0-degree node into worklist.
     for (unsigned dst = 0; dst < n; dst++)
       if (adjM[src][dst] && --inDegree[dst] == 0) {
-        if (linalg::isReductionIterator(iteratorTypes[dst]))
+        if (!merger.isFilterLoop(dst) &&
+            linalg::isReductionIterator(iteratorTypes[dst]))
           redIt.push_back(dst);
         else
           parIt.push_back(dst);
@@ -216,23 +257,38 @@
 /// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
 static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
                                std::vector<unsigned> &inDegree, AffineExpr a,
-                               AffineExpr b, unsigned fidx) {
-  switch (a.getKind()) {
-  case AffineExprKind::DimId: {
-    unsigned idx = a.cast<AffineDimExpr>().getPosition();
-    if (b)
-      addAffineOrderings(adjM, inDegree, b, AffineExpr(), idx);
-    else if (!adjM[fidx][idx]) {
-      adjM[fidx][idx] = true;
-      inDegree[idx]++;
+                               AffineExpr b, Optional<unsigned> fidx,
+                               Optional<unsigned> tidx) {
+  if (!a && !b) {
+    // Recursion leaf.
+    assert(fidx && tidx);
+    unsigned f = *fidx, t = *tidx;
+    if (!adjM[f][t]) {
+      adjM[f][t] = true;
+      inDegree[t]++;
     }
+    return;
+  }
+  auto toExpand = a ? a : b;
+  switch (toExpand.getKind()) {
+  case AffineExprKind::DimId: {
+    auto idx = toExpand.cast<AffineDimExpr>().getPosition();
+    if (toExpand == a)
+      addAffineOrderings(adjM, inDegree, AffineExpr(), b, idx, tidx);
+    else // toExpand == b
+      addAffineOrderings(adjM, inDegree, a, AffineExpr(), fidx, idx);
     break;
   }
   case AffineExprKind::Add:
   case AffineExprKind::Mul: {
-    auto binOp = a.cast<AffineBinaryOpExpr>();
-    addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx);
-    addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx);
+    auto binOp = toExpand.cast<AffineBinaryOpExpr>();
+    if (toExpand == a) {
+      addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx, tidx);
+      addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx, tidx);
+    } else {
+      addAffineOrderings(adjM, inDegree, a, binOp.getLHS(), fidx, tidx);
+      addAffineOrderings(adjM, inDegree, a, binOp.getRHS(), fidx, tidx);
+    }
     break;
   }
   default:
@@ -250,7 +306,7 @@
                                   OpOperand *skip = nullptr) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
-  unsigned n = op.getNumLoops();
+  unsigned n = merger.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
   std::vector<unsigned> inDegree(n, 0); // in-degree of each node.
   auto iteratorTypes = op.getIteratorTypesArray();
@@ -262,7 +318,7 @@
     // Get map and encoding.
     auto map = op.getMatchingIndexingMap(&t);
     auto enc = getSparseTensorEncoding(t.get().getType());
-    assert(map.getNumDims() == n);
+    assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n);
     // Skip dense tensor constraints when not requested.
     if (!(mask & SortMask::kIncludeDense) && !enc)
       continue;
@@ -270,10 +326,42 @@
     // by default) puts an ordering constraint on the loop indices. For
     // example, the tensor expresion A_ijk forces the ordering i < j < k
     // on the loop indices if no explicit dimension ordering is given.
-    for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) {
-      AffineExpr f = map.getResult(toOrigDim(enc, d - 1));
-      AffineExpr t = map.getResult(toOrigDim(enc, d));
-      addAffineOrderings(adjM, inDegree, f, t, 0);
+    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
+      AffineExpr ta = map.getResult(toOrigDim(enc, d));
+      Optional<unsigned> tldx = merger.getLoopIdx(t.getOperandNumber(), d);
+      if (tldx && merger.isFilterLoop(tldx.value())) {
+        assert(!ta.isa<AffineDimExpr>() &&
+               !isDenseDLT(getDimLevelType(enc, d)));
+        addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx);
+        // Now that the ordering of affine expression is captured by filter
+        // loop idx, we only need to ensure the affine ordering against filter
+        // loop. Thus, we reset the affine express to nil here to mark it as
+        // resolved.
+        ta = AffineExpr();
+      }
+
+      if (d > 0) {
+        AffineExpr fa = map.getResult(toOrigDim(enc, d - 1));
+        Optional<unsigned> fldx =
+            merger.getLoopIdx(t.getOperandNumber(), d - 1);
+
+        // Filter loops should be constructed after all the dependent loops,
+        // i.e., d0 + d1 < filter_loop(d0 + d1)
+        if (fldx && merger.isFilterLoop(fldx.value())) {
+          // This must be a compound affine expression on sparse dimension.
+          assert(!fa.isa<AffineDimExpr>() &&
+                 !isDenseDLT(getDimLevelType(enc, d - 1)));
+          // For the same reason above.
+          fa = AffineExpr();
+        }
+
+        // (d0 + d1) < (d2 + d3), or
+        // filter_loop_d-1 < (d2 + d3), or
+        // (d0 + d1) < filter_loop_d, or
+        // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
+        // above.
+        addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
+      }
     }
     // Push unrelated loops into sparse iteration space, so these
     // will be skipped more often.
@@ -297,7 +385,7 @@
   // Report failure for a cyclic iteration graph.
   topSort.clear();
   topSort.reserve(n);
-  return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM);
+  return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM);
 }
 
 /// Returns true if tensor materializes uninitialized into the computation.
@@ -325,9 +413,8 @@
   // An all-dense annotated "sparse" output tensor becomes a linearized random
   // access 1-dim memref. Also admissible since insertions cannot occur.
   bool allDense = true;
-  auto iteratorTypes = op.getIteratorTypesArray();
-  unsigned numLoops = iteratorTypes.size();
-  for (unsigned i = 0; i < numLoops; i++)
+  unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops
+  for (unsigned i = 0; i < merger.getNumLoops(); i++)
     if (isCompressedDLT(merger.getDimLevelType(tensor, i)) ||
         isSingletonDLT(merger.getDimLevelType(tensor, i))) {
       allDense = false;
@@ -338,19 +425,31 @@
     }
   if (allDense)
     return true;
+
+  // TODO: support compound affine expression on sparse output.
+  if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs),
+                                       lhs->get()) != 0)
+    return false;
+
   // A tensor expression with a sparse output tensor that changes its values
   // but not its nonzero structure, an operation called "simply dynamic" in
   // [Bik96,Ch9], is also admissible without special codegen.
   if (merger.isSingleCondition(tensor, exp))
     return true;
+
   // Accept "truly dynamic" if the output tensor materializes uninitialized
   // into the computation and insertions occur in lexicographic index order.
   if (isMaterializing(lhs->get())) {
+    auto iteratorTypes = op.getIteratorTypesArray();
     unsigned nest = 0;
     for (unsigned i = 0; i < numLoops; i++) {
-      if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
-        break; // terminate at first reduction
-      nest++;
+      if (!merger.isFilterLoop(topSort[i])) {
+        // We only count non-filter loops as filter loops should be considered
+        // as a special type of parallel loops.
+        if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
+          break; // terminate at first reduction
+        nest++;
+      }
     }
     // Determine admissible dynamic insertion situations:
     // (1) fully injective, since there are no reductions,
@@ -446,12 +545,12 @@
   codegen.loopEmitter.initializeLoopEmit(
       builder, loc,
       /// Generates buffer for the output tensor.
-      /// Note that all sparse kernels assume that when all elements are written
-      /// to (viz. x(i) = y(i) * z(i)), the output buffer is already initialized
-      /// to all zeroes and only nonzeroes values are computed and written out.
-      /// For updates (viz. x(i) += y(i) * z(i)), only nonzeroes values are used
-      /// for the updates and no assumption on the original contents of the
-      /// output buffer is necessary.
+      /// Note that all sparse kernels assume that when all elements are
+      /// written to (viz. x(i) = y(i) * z(i)), the output buffer is already
+      /// initialized to all zeroes and only nonzeroes values are computed and
+      /// written out. For updates (viz. x(i) += y(i) * z(i)), only nonzeroes
+      /// values are used for the updates and no assumption on the original
+      /// contents of the output buffer is necessary.
       [&op](OpBuilder &builder, Location loc, Value memref,
             Value tensor) -> Value {
         // Must not be a sparse tensor.
@@ -460,13 +559,13 @@
         // Two output tensors references should pointed to the same object.
         assert(lhs->get() == tensor);
         bool isInit = op.isInitTensor(lhs);
-        // An output tensor can simply materialize from the buffer of the tensor
-        // that appears in the outs() clause. For updates, this has the
+        // An output tensor can simply materialize from the buffer of the
+        // tensor that appears in the outs() clause. For updates, this has the
         // advantage that only the nonzero value are involved in the
-        // computation, keeping the operation O(nnz). In all other cases, we are
-        // forced to zero out the buffer to enforce the assumption above, which
-        // may negatively impact running complexity (viz. O(n^2 + nnz) vs.
-        // O(nnz) for matrices).
+        // computation, keeping the operation O(nnz). In all other cases, we
+        // are forced to zero out the buffer to enforce the assumption above,
+        // which may negatively impact running complexity (viz. O(n^2 + nnz)
+        // vs. O(nnz) for matrices).
         // TODO: use better analysis to avoid zeroing out the buffer?
         Value init = memref;
         if (!isInit) {
@@ -531,9 +630,6 @@
   unsigned rank = map.getNumResults();
   if (enc) {
     // Note that currently, all sparse subscripts are simple.
-    // TODO: accept affine too?
-    assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() ==
-           AffineExprKind::DimId);
     Value pidx = codegen.loopEmitter.getPidxs()[tensor].back();
     assert(pidx);
     args.push_back(pidx); // position index
@@ -778,8 +874,11 @@
   switch (a.getKind()) {
   case AffineExprKind::DimId: {
     unsigned idx = a.cast<AffineDimExpr>().getPosition();
-    if (idx == ldx)
+    if (idx == ldx) {
       atLevel = true;
+      // Must be invariant if we are at the level.
+      return true;
+    }
     return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play?
   }
   case AffineExprKind::Add:
@@ -807,7 +906,14 @@
     auto enc = getSparseTensorEncoding(t.get().getType());
     for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
       AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!isInvariantAffine(codegen, a, ldx, atLevel))
+      Optional<unsigned> sldx = merger.getLoopIdx(t.getOperandNumber(), d);
+      if (sldx && merger.isFilterLoop(sldx.value())) {
+        if (!codegen.getLoopIdxValue(sldx.value()))
+          // The filter loops has not been constructed.
+          return;
+        if (sldx.value() == ldx)
+          atLevel = true;
+      } else if (!isInvariantAffine(codegen, a, ldx, atLevel))
         return; // still in play
     }
     // All exhausted at this level (atLevel denotes exactly at this level).
@@ -924,13 +1030,22 @@
                          ArrayRef<size_t> extraTids,
                          ArrayRef<size_t> extraDims) {
   Location loc = op.getLoc();
-  auto iteratorTypes = op.getIteratorTypesArray();
   bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
                   isSingletonDLT(merger.getDimLevelType(tid, idx));
   bool isParallel = isParallelFor(codegen, isOuter, isSparse);
 
   Operation *loop =
       genReducUpdateStmt(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        if (merger.isFilterLoop(idx)) {
+          assert(isSparse);
+          OpOperand *t = &op->getOpOperand(tid);
+          auto enc = getSparseTensorEncoding(t->get().getType());
+          // Retrieves the affine expression for the filter loop.
+          AffineExpr a =
+              op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim));
+          return codegen.loopEmitter.enterFilterLoopOverTensorAtDim(
+              builder, loc, tid, dim, a, reduc);
+        }
         return codegen.loopEmitter.enterLoopOverTensorAtDim(
             builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
       }).value();
@@ -1116,12 +1231,13 @@
   return false;
 }
 
-static void translateBitsToTidDimPairs(Merger &merger, CodeGen &codegen,
-                                       unsigned li, unsigned idx,
-                                       SmallVectorImpl<size_t> &condTids,
-                                       SmallVectorImpl<size_t> &condDims,
-                                       SmallVectorImpl<size_t> &extraTids,
-                                       SmallVectorImpl<size_t> &extraDims) {
+static void translateBitsToTidDimPairs(
+    Merger &merger, CodeGen &codegen, linalg::GenericOp op, unsigned li,
+    unsigned idx, SmallVectorImpl<size_t> &condTids,
+    SmallVectorImpl<size_t> &condDims, SmallVectorImpl<size_t> &extraTids,
+    SmallVectorImpl<size_t> &extraDims, SmallVectorImpl<size_t> &affineTids,
+    SmallVectorImpl<size_t> &affineDims, SmallVectorImpl<AffineExpr> &exps) {
+
   const BitVector &all = merger.lat(li).bits;
   const BitVector &simple = merger.lat(li).simple;
 
@@ -1149,6 +1265,38 @@
       // TODO: get rid of extraTids and extraDims.
       extraTids.push_back(tid);
       extraDims.push_back(dim.value());
+    } else {
+      assert(isUndefDLT(dlt));
+      if (tid >= op.getNumDpsInputs())
+        // We only handle affine expression on input tensors (for now).
+        return;
+      OpOperand *operand = &op->getOpOperand(tid);
+      auto enc = getSparseTensorEncoding(operand->get().getType());
+      if (!enc)
+        // Non-annotated dense tensors requires no special handling.
+        return;
+
+      ArrayRef<AffineExpr> affines =
+          op.getMatchingIndexingMap(operand).getResults();
+      assert(affines.size() == enc.getDimLevelType().size());
+      for (unsigned i = 0, e = affines.size(); i < e; i++) {
+        AffineExpr exp = affines[toOrigDim(enc, i)];
+        if (exp.isa<AffineDimExpr>() || !isDenseDLT(getDimLevelType(enc, i)))
+          // Skip simple affine expression and non dense dimensions (which has
+          // it own filter loop).
+          continue;
+
+        bool atLevel = false;
+        if (isInvariantAffine(codegen, exp, idx, atLevel) && atLevel) {
+          // If the compound affine is invariant and we are right at the
+          // level. We need to generate the address according to the affine
+          // expression. This is also the best place we can do it to avoid
+          // putting it inside inner loops.
+          affineTids.push_back(tid);
+          affineDims.push_back(i);
+          exps.push_back(exp);
+        }
+      }
     }
   });
 
@@ -1156,7 +1304,6 @@
     // Note that we generate dense indices of the output tensor
     // unconditionally, since they may not appear in the lattice, but may be
     // needed for linearized codegen.
-    // Only dense dimensions should be optimized from conditions.
     auto dim = merger.getDimNum(merger.getOutTensorID(), idx).value();
     extraTids.push_back(merger.getOutTensorID());
     extraDims.push_back(dim);
@@ -1173,11 +1320,20 @@
   // need extra locals to iterate on them.
   SmallVector<size_t, 4> extraTids, extraDims;
 
-  translateBitsToTidDimPairs(merger, codegen, li, codegen.topSort[at], condTids,
-                             condDims, extraTids, extraDims);
+  SmallVector<size_t, 4> affineTids, affineDims;
+  SmallVector<AffineExpr, 4> affines;
+
+  translateBitsToTidDimPairs(merger, codegen, op, li, codegen.topSort[at],
+                             condTids, condDims, extraTids, extraDims,
+                             affineTids, affineDims, affines);
   // Emit the for/while-loop control.
   Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv,
                             condTids, condDims, extraTids, extraDims);
+
+  for (auto [tid, dim, exp] : llvm::zip(affineTids, affineDims, affines)) {
+    codegen.loopEmitter.genDenseAffineAddressAtCurLevel(builder, op.getLoc(),
+                                                        tid, dim, exp);
+  }
   return loop;
 }
 
@@ -1320,7 +1476,8 @@
       return failure();
     unsigned numTensors = op->getNumOperands();
     unsigned numLoops = op.getNumLoops();
-    Merger merger(numTensors, numLoops);
+    unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op);
+    Merger merger(numTensors, numLoops, numFilterLoops);
     if (!findSparseAnnotations(merger, op))
       return failure();
 
diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
@@ -54,6 +54,57 @@
   return %0 : tensor<32xf32>
 }
 
+// CHECK-LABEL:   func.func @mul_inv_sparse1d(
+// CHECK-SAME:                                %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>,
+// CHECK-SAME:                                %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>)
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf32>
+// CHECK:           %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf32>
+// CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_6]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:             %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:             %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_4]] : index
+// CHECK:             %[[VAL_19:.*]] = scf.if %[[VAL_18]] -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:               %[[VAL_20:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_15]]] : memref<?xf32>
+// CHECK:               %[[VAL_21:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK:               %[[VAL_22:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK:               %[[VAL_23:.*]] = scf.for %[[VAL_24:.*]] = %[[VAL_21]] to %[[VAL_22]] step %[[VAL_3]] iter_args(%[[VAL_25:.*]] = %[[VAL_16]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:                 %[[VAL_26:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
+// CHECK:                 %[[VAL_27:.*]] = arith.mulf %[[VAL_26]], %[[VAL_20]] : f32
+// CHECK:                 %[[VAL_28:.*]] = arith.addf %[[VAL_27]], %[[VAL_5]] : f32
+// CHECK:                 %[[VAL_29:.*]] = sparse_tensor.insert %[[VAL_28]] into %[[VAL_25]]{{\[}}%[[VAL_17]]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:                 scf.yield %[[VAL_29]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:               }
+// CHECK:               scf.yield %[[VAL_30:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:             } else {
+// CHECK:               scf.yield %[[VAL_16]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:             }
+// CHECK:             scf.yield %[[VAL_31:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           }
+// CHECK:           %[[VAL_32:.*]] = sparse_tensor.load %[[VAL_33:.*]] hasInserts : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           return %[[VAL_32]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>
+func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>,
+                            %argb: tensor<4xf32, #SpVec>) -> tensor<32xf32, #SpVec> {
+  %argx = bufferization.alloc_tensor() : tensor<32xf32, #SpVec>
+  %0 = linalg.generic #trait1
+     ins(%arga, %argb: tensor<32xf32, #SpVec>, tensor<4xf32, #SpVec>)
+    outs(%argx: tensor<32xf32, #SpVec>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = arith.mulf %a, %b : f32
+        %1 = arith.addf %x, %0 : f32
+        linalg.yield %1 : f32
+  } -> tensor<32xf32, #SpVec>
+  return %0 : tensor<32xf32, #SpVec>
+}
+
 #trait2 = {
   indexing_maps = [
     affine_map<(i) -> (i)>,  // a
@@ -104,6 +155,58 @@
   return %0 : tensor<32xi32>
 }
 
+// CHECK-LABEL:   func.func @and_affine_sparse1d(
+// CHECK-SAME:                                   %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>,
+// CHECK-SAME:                                   %[[VAL_1:.*]]: tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>>)
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xi32>
+// CHECK:           %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xi32>
+// CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_5]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:             %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:             %[[VAL_18:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xi32>
+// CHECK:             %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK:             %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK:             %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_19]] to %[[VAL_20]] step %[[VAL_3]] iter_args(%[[VAL_23:.*]] = %[[VAL_16]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:               %[[VAL_24:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_22]]] : memref<?xindex>
+// CHECK:               %[[VAL_25:.*]] = arith.addi %[[VAL_17]], %[[VAL_4]] : index
+// CHECK:               %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_24]], %[[VAL_25]] : index
+// CHECK:               %[[VAL_27:.*]] = scf.if %[[VAL_26]] -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:                 %[[VAL_28:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_22]]] : memref<?xi32>
+// CHECK:                 %[[VAL_29:.*]] = arith.andi %[[VAL_18]], %[[VAL_28]] : i32
+// CHECK:                 %[[VAL_30:.*]] = sparse_tensor.insert %[[VAL_29]] into %[[VAL_23]]{{\[}}%[[VAL_17]]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:                 scf.yield %[[VAL_30]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:               } else {
+// CHECK:                 scf.yield %[[VAL_23]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:               }
+// CHECK:               scf.yield %[[VAL_31:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:             }
+// CHECK:             scf.yield %[[VAL_32:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           }
+// CHECK:           %[[VAL_33:.*]] = sparse_tensor.load %[[VAL_34:.*]] hasInserts : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           return %[[VAL_33]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>
+func.func @and_affine_sparse1d(%arga: tensor<32xi32, #SpVec>,
+                               %argb: tensor<34xi32, #SpVec>) -> tensor<32xi32, #SpVec> {
+  %argx = bufferization.alloc_tensor() : tensor<32xi32, #SpVec>
+  %0 = linalg.generic #trait2
+     ins(%arga, %argb: tensor<32xi32, #SpVec>, tensor<34xi32, #SpVec>)
+    outs(%argx: tensor<32xi32, #SpVec>) {
+      ^bb(%a: i32, %b: i32, %x: i32):
+        %0 = arith.andi %a, %b : i32
+        linalg.yield %0 : i32
+  } -> tensor<32xi32, #SpVec>
+  return %0 : tensor<32xi32, #SpVec>
+}
+
+
 #trait3 = {
   indexing_maps = [
     affine_map<(i,j) -> (i,j)>,      // a
@@ -160,3 +263,65 @@
   } -> tensor<32x16xf64>
   return %0 : tensor<32x16xf64>
 }
+
+// CHECK-LABEL:   func.func @mul_affine_sparse2d(
+// CHECK-SAME:                                   %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>,
+// CHECK-SAME:                                   %[[VAL_1:.*]]: tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 32 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[VAL_7:.*]] = arith.constant 3 : index
+// CHECK:           %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+// CHECK:           %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+// CHECK:           %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_17:.*]] = %[[VAL_8]]) -> (tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:             %[[VAL_18:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index
+// CHECK:             %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref<?xindex>
+// CHECK:             %[[VAL_20:.*]] = arith.addi %[[VAL_16]], %[[VAL_4]] : index
+// CHECK:             %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_20]]] : memref<?xindex>
+// CHECK:             %[[VAL_22:.*]] = scf.for %[[VAL_23:.*]] = %[[VAL_19]] to %[[VAL_21]] step %[[VAL_4]] iter_args(%[[VAL_24:.*]] = %[[VAL_17]]) -> (tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:               %[[VAL_25:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK:               %[[VAL_26:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xf64>
+// CHECK:               %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_18]]] : memref<?xindex>
+// CHECK:               %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_4]] : index
+// CHECK:               %[[VAL_29:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK:               %[[VAL_30:.*]]:2 = scf.for %[[VAL_31:.*]] = %[[VAL_27]] to %[[VAL_29]] step %[[VAL_4]] iter_args(%[[VAL_32:.*]] = %[[VAL_6]], %[[VAL_33:.*]] = %[[VAL_24]]) -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:                 %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_31]]] : memref<?xindex>
+// CHECK:                 %[[VAL_35:.*]] = arith.addi %[[VAL_25]], %[[VAL_7]] : index
+// CHECK:                 %[[VAL_36:.*]] = arith.cmpi eq, %[[VAL_34]], %[[VAL_35]] : index
+// CHECK:                 %[[VAL_37:.*]]:2 = scf.if %[[VAL_36]] -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) {
+// CHECK:                   %[[VAL_38:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK:                   %[[VAL_39:.*]] = arith.mulf %[[VAL_26]], %[[VAL_38]] : f64
+// CHECK:                   %[[VAL_40:.*]] = arith.addf %[[VAL_32]], %[[VAL_39]] : f64
+// CHECK:                   scf.yield %[[VAL_40]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:                 } else {
+// CHECK:                   scf.yield %[[VAL_32]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:                 }
+// CHECK:                 scf.yield %[[VAL_41:.*]]#0, %[[VAL_41]]#1 : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:               }
+// CHECK:               %[[VAL_42:.*]] = sparse_tensor.insert %[[VAL_43:.*]]#0 into %[[VAL_43]]#1{{\[}}%[[VAL_16]], %[[VAL_25]]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:               scf.yield %[[VAL_42]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:             }
+// CHECK:             scf.yield %[[VAL_44:.*]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           }
+// CHECK:           %[[VAL_45:.*]] = sparse_tensor.load %[[VAL_46:.*]] hasInserts : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:           return %[[VAL_45]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>
+func.func @mul_affine_sparse2d(%arga: tensor<32x16xf64, #CSR>,
+                              %argb: tensor<34x19xf64, #CSR>) -> tensor<32x16xf64, #CSR> {
+  %argx = bufferization.alloc_tensor() : tensor<32x16xf64, #CSR>
+  %0 = linalg.generic #trait3
+     ins(%arga, %argb: tensor<32x16xf64, #CSR>, tensor<34x19xf64, #CSR>)
+    outs(%argx: tensor<32x16xf64, #CSR>) {
+      ^bb(%a: f64, %b: f64, %x: f64):
+        %0 = arith.mulf %a, %b : f64
+        %1 = arith.addf %x, %0 : f64
+        linalg.yield %1 : f64
+  } -> tensor<32x16xf64, #CSR>
+  return %0 : tensor<32x16xf64, #CSR>
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
@@ -4,6 +4,11 @@
 // RUN: FileCheck %s
 
 #DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
+#CSR = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}>
+#CSC = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
 
 // An example of a 2D convolution with a sparse filter.
 module {
@@ -26,6 +31,33 @@
     return %0 : tensor<6x6xi32, #DCSR>
   }
 
+  func.func @conv2d_all_sparse_DCSR(%input:  tensor<8x8xi32, #DCSR>,
+               %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> {
+    %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>           
+    %0 = linalg.conv_2d
+      ins  (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>)
+      outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    return %0 : tensor<6x6xi32, #DCSR>
+  }
+
+  func.func @conv2d_all_sparse_CSR(%input:  tensor<8x8xi32, #CSR>,
+               %filter: tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> {
+    %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR>           
+    %0 = linalg.conv_2d
+      ins  (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>)
+      outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR>
+    return %0 : tensor<6x6xi32, #CSR>
+  }
+  
+  func.func @conv2d_all_sparse_CSC(%input:  tensor<8x8xi32, #CSC>,
+               %filter: tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> {
+    %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC>           
+    %0 = linalg.conv_2d
+      ins  (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>)
+      outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC>
+    return %0 : tensor<6x6xi32, #CSC>
+  }
+
   func.func @entry() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
@@ -36,8 +68,13 @@
       [  0,  0,  0 ],
       [ -1,  0,  1 ]
     ]> : tensor<3x3xi32>
-    %sparse_filter = sparse_tensor.convert %filter
+    %sparse_filter_DCSR = sparse_tensor.convert %filter
       : tensor<3x3xi32> to tensor<3x3xi32, #DCSR>
+    %sparse_filter_CSR = sparse_tensor.convert %filter
+      : tensor<3x3xi32> to tensor<3x3xi32, #CSR>
+    %sparse_filter_CSC = sparse_tensor.convert %filter
+      : tensor<3x3xi32> to tensor<3x3xi32, #CSC>
+
 
     %input = arith.constant dense<[
       [  1,  2,  3,  4,  0,  6,  7,  8 ],
@@ -49,15 +86,31 @@
       [  1,  3,  3,  4,  3,  6,  6,  8 ],
       [  1,  3,  3,  4,  3,  0,  7,  8 ]
     ]> : tensor<8x8xi32>
-
+    %sparse_input_DCSR = sparse_tensor.convert %input
+      : tensor<8x8xi32> to tensor<8x8xi32, #DCSR>
+    %sparse_input_CSR = sparse_tensor.convert %input
+      : tensor<8x8xi32> to tensor<8x8xi32, #CSR>
+    %sparse_input_CSC = sparse_tensor.convert %input
+      : tensor<8x8xi32> to tensor<8x8xi32, #CSC>
+      
     // Call the kernel.
     %output = arith.constant dense<0> : tensor<6x6xi32>
-    %0 = call @conv2d(%input, %sparse_filter, %output)
+    %0 = call @conv2d(%input, %sparse_filter_DCSR, %output)
        : (tensor<8x8xi32>,
           tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32>
-    %1 = call @conv2d_sparse_out(%input, %sparse_filter)
+    %1 = call @conv2d_sparse_out(%input, %sparse_filter_DCSR)
        : (tensor<8x8xi32>,
           tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %sparse_filter_DCSR)
+       : (tensor<8x8xi32, #DCSR>,
+          tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR)
+       : (tensor<8x8xi32, #CSR>,
+          tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR>
+    %4 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %sparse_filter_CSC)
+       : (tensor<8x8xi32, #CSC>,
+          tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC>
+          
  
     // Verify the output.
     //
@@ -86,10 +139,65 @@
     %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0
       : tensor<6x6xi32>, vector<6x6xi32>
     vector.print %v1 : vector<6x6xi32>
-
+    
+    //
+    // Should be the same as dense output
+    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %all_sparse_DCSR = sparse_tensor.convert %2
+      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
+    %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v2 : vector<6x6xi32>
+    
+    //
+    // Should be the same as dense output
+    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %all_sparse_CSR = sparse_tensor.convert %3
+      : tensor<6x6xi32, #CSR> to tensor<6x6xi32>
+    %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v3 : vector<6x6xi32>
+    
+    //
+    // Should be the same as dense output
+    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %all_sparse_CSC = sparse_tensor.convert %4
+      : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
+    %v4 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v4 : vector<6x6xi32>
+    
     // Release the resources.
-    bufferization.dealloc_tensor %sparse_filter : tensor<3x3xi32, #DCSR>
+    bufferization.dealloc_tensor %sparse_filter_DCSR : tensor<3x3xi32, #DCSR>
+    bufferization.dealloc_tensor %sparse_filter_CSR : tensor<3x3xi32, #CSR>
+    bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC>
+    
+    bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR>
+    bufferization.dealloc_tensor %sparse_input_CSR : tensor<8x8xi32, #CSR>
+    bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC>
+
     bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR>
+    bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
+    bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
+    bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CSC>
     return
   }
 }
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -127,7 +127,7 @@
 protected:
   MergerTestBase(unsigned numTensors, unsigned numLoops)
       : numTensors(numTensors), numLoops(numLoops),
-        merger(numTensors, numLoops) {}
+        merger(numTensors, numLoops, /*numFilterLoops=*/0) {}
 
   ///
   /// Expression construction helpers.