diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -229,6 +229,8 @@
   AffineMap origIndexingMap = genericOp.getMatchingIndexingMap(opOperand);
   llvm::DenseMap<int64_t, int64_t> domainDimToOperandDim;
   SmallVector<AffineExpr> exprs(origIndexingMap.getResults());
+
+  // If the OpOperand is a scalar or a zero-rank tensor, no need to pack.
   if (genericOp.isScalar(opOperand) || exprs.empty())
     return std::make_tuple(opOperand->get(),
                            AffineMap::get(numLoops, 0, exprs, b.getContext()));
@@ -293,10 +295,10 @@
   return std::make_tuple(packedOperand, indexingMap);
 }
 
-/// Pack an element-wise genericOp and return it.
-static GenericOp packElementWiseOp(RewriterBase &rewriter, GenericOp genericOp,
-                                   Value dest, AffineMap packedOutIndexingMap,
-                                   const PackInfo &packInfo) {
+/// Pack a genericOp and return it.
+static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
+                               Value dest, AffineMap packedOutIndexingMap,
+                               const PackInfo &packInfo) {
   Location loc = genericOp.getLoc();
   SmallVector<Value> inputOperands;
   SmallVector<AffineMap> indexingMaps;
@@ -442,8 +444,8 @@
           .getDefiningOp<tensor::EmptyOp>()) {
     dest = packOpDest;
   }
-  return packElementWiseOp(rewriter, genericOp, dest, packedOutIndexingMap,
-                           *packInfo);
+  return packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap,
+                       *packInfo);
 }
 
 /// Folds pack(fill) into a single fill op if
@@ -527,7 +529,7 @@
   ControlPropagationFn controlFn;
 };
 
-// TODO: Relax this restriction. We should unpack an elementwise also
+// TODO: Relax this restriction. We should unpack a generic op also
 // in the presence of multiple unpack ops as producers.
 /// Return the unpacked operand, if present, for the current generic op.
 static FailureOr<OpOperand *> getUnPackedOperand(GenericOp genericOp) {
@@ -545,7 +547,7 @@
   return unPackedOperand;
 }
 
-/// Push down a tensor.unpack op through elementwise generic op.
+/// Push down a tensor.unpack op through a generic op.
 /// The new generic op works on packed domain; pack ops are created for input
 /// and output operands. A tensor.unpack op is inserted right after the packed
 /// generic. E.g.
@@ -619,8 +621,8 @@
   }
 
   // Pack the genericOp.
-  GenericOp newGenericOp = packElementWiseOp(rewriter, genericOp, dest,
-                                             packedOutIndexingMap, *packInfo);
+  GenericOp newGenericOp =
+      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo);
   Value newResult =
       newGenericOp.getTiedOpResult(newGenericOp.getDpsInitOperand(0));
 
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -706,15 +706,16 @@
 #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>, %dest: tensor<4x16x16x32xi32>) -> tensor<4x16x16x32xi32>{
-  %init = tensor.empty() : tensor<128x256xi32>
+func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>,
+    %arg1: tensor<128x256xi32>) -> tensor<4x16x16x32xi32>{
   %elem = linalg.generic {indexing_maps = [#map0, #map1],
                           iterator_types = ["parallel", "parallel", "reduction"]}
     ins(%arg0 : tensor<128x256x32xi32>)
-    outs(%init : tensor<128x256xi32>) {
+    outs(%arg1 : tensor<128x256xi32>) {
     ^bb0(%arg3: i32, %arg4: i32):
       %4 = arith.addi %arg3, %arg4 : i32
       linalg.yield %4 : i32
     } -> tensor<128x256xi32>
+  %dest = tensor.empty() : tensor<4x16x16x32xi32>
   %pack = tensor.pack %elem
     inner_dims_pos = [1, 0]
     inner_tiles = [16, 32]
@@ -725,7 +726,11 @@
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>
 // CHECK: func.func @reduction_pack_transpose_inner_dims
 // CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK-SAME:  %[[DEST:[a-zA-Z0-9]+]]
+// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK:       %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32>
+// CHECK:       %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]]
+// CHECK-SAME:    inner_dims_pos = [1, 0] inner_tiles = [16, 32]
+// CHECK-SAME:    into %[[ARG1_EMPTY]]
 // CHECK:       %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x32x16x32xi32>
 // CHECK:       %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:    inner_dims_pos = [1, 0] inner_tiles = [16, 32]
@@ -734,14 +739,14 @@
 // CHECK-SAME:  indexing_maps = [#[[MAP0]], #[[MAP1]]]
 // CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel"]
 // CHECK-SAME:  ins(%[[PACK_ARG0]]
-// CHECK-SAME:  outs(%[[DEST]]
+// CHECK-SAME:  outs(%[[PACK_ARG1]]
 // CHECK:       return %[[RED]] : tensor<4x16x16x32xi32>
 
 // -----
 
-func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100xi32>, %arg2: tensor<128xi32>) -> tensor<4x16x100x16x32xi32>
+func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100xi32>,
+  %arg2: tensor<128xi32>, %init_reduction: tensor<100x128x256xi32>) -> tensor<4x16x100x16x32xi32>
 {
-  %init_reduction = tensor.empty() : tensor<100x128x256xi32>
   %reduction = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
                        affine_map<(d0, d1, d2, d3) -> (d0)>,
@@ -773,7 +778,11 @@
 // CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]
 // CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]
-// CHECK:       %[[INIT_EMPTY:.+]] = tensor.empty() : tensor<4x16x100x16x32xi32>
+// CHECK-SAME:  %[[ARG3:[a-zA-Z0-9]+]]
+// CHECK:       %[[ARG3_EMPTY:.+]] = tensor.empty() : tensor<4x16x100x16x32xi32>
+// CHECK:       %[[PACKED_ARG3:.+]] = tensor.pack %[[ARG3]]
+// CHECK-SAME:    outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 32]
+// CHECK-SAME:    into %[[ARG3_EMPTY]]
 // CHECK:       %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x200x100x16x32xi32>
 // CHECK:       %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:    outer_dims_perm = [1, 3, 2, 0] inner_dims_pos = [3, 1] inner_tiles = [16, 32]
@@ -785,16 +794,16 @@
 // CHECK: %[[RES:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]], #[[MAP3]]]
 // CHECK-SAME: ins(%[[PACKED_ARG0]], %[[ARG1]], %[[PACKED_ARG2]]
-// CHECK-SAME: outs(%[[INIT_EMPTY]]
+// CHECK-SAME: outs(%[[PACKED_ARG3]]
 
 // -----
 
 #map0 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d3 * 2 + d5)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d3)>
-func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32>) -> tensor<16x540x960xi32>{
+func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32>,
+    %filter: tensor<2x2xi32>) -> tensor<16x540x960xi32>{
   %init = tensor.empty() : tensor<16x540x960xi32>
-  %filter = tensor.empty() : tensor<2x2xi32>
   %empty = tensor.empty() : tensor<1x16x1080x1920xi32>
   %unpack = tensor.unpack %arg0
     inner_dims_pos = [1]
@@ -804,7 +813,7 @@
     ins(%unpack, %filter : tensor<1x16x1080x1920xi32>, tensor<2x2xi32>)
     outs(%init : tensor<16x540x960xi32>) {
   ^bb0(%in: i32, %in_1: i32, %out: i32):
-    %max = arith.maxui %in, %out : i32
+    %max = arith.maxui %in, %in_1 : i32
     linalg.yield %max : i32
   } -> tensor<16x540x960xi32>
   return %pool : tensor<16x540x960xi32>
@@ -814,7 +823,7 @@
 // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d3, d6)>
 // CHECK: func.func @unpack_different_destination_shape
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK: %[[FILTER:.+]] = tensor.empty() : tensor<2x2xi32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]
 // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x540x960x16xi32>
 // CHECK: %[[PACK_EMPTY:.+]] = tensor.empty() : tensor<1x1x1080x1920x16xi32>
 // CHECK: %[[PACK_ARG0:.+]] = tensor.pack
@@ -823,7 +832,7 @@
 // CHECK: %[[POOL:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]
 // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"]
-// CHECK-SAME: ins(%[[PACK_ARG0]], %[[FILTER]]
+// CHECK-SAME: ins(%[[PACK_ARG0]], %[[ARG1]]
 // CHECK-SAME: outs(%[[INIT]]
 // CHECK: %[[UNPACK_NEW_DEST:.+]] = tensor.empty() : tensor<16x540x960xi32>
 // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[POOL]]