diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -140,7 +140,9 @@
     return s;
   }
 
-  /// Optimizes the iteration lattice points in the given set.
+  /// Optimizes the iteration lattice points in the given set. This
+  /// method should be called right before code generation to avoid
+  /// generating redundant loops and conditions.
   unsigned optimize(unsigned s0) {
     unsigned s = addSet();
     assert(latSets[s0].size() != 0);
@@ -148,15 +150,21 @@
     for (unsigned p1 : latSets[s0]) {
       bool add = true;
       if (p0 != p1) {
+        // Is this a straightforward copy?
+        unsigned e = latPoints[p1].exp;
+        if (exp(e).kind == Kind::kTensor && exp(e).e0 == numTensors - 1)
+          continue;
+        // Is any dense index exhausted?
        llvm::BitVector tmp = latPoints[p1].bits;
        tmp ^= latPoints[p0].bits;
        if (hasAnyOf(tmp, false))
-          continue; // dense exhausted?
+          continue;
+        // Is this a direct duplication of an earlier conjunction?
        for (unsigned p2 : latSets[s]) {
          tmp = latPoints[p1].bits;
          tmp ^= latPoints[p2].bits;
          if (tmp.count() == 0) {
-            add = false; // direct dup?
+            add = false;
            break;
          }
        }
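Note on the optimize() change above: for a reduction such as x = SUM_i a(i), the expression x + a(i) produces a lattice point whose expression is just the output tensor x itself (the last tensor, hence the numTensors - 1 test). Iterating that point alone would only rewrite x with its own value, so it is now pruned before code generation. A rough sketch of the kind of dead loop this avoids; the SSA names are illustrative, not compiler output:

  // Redundant self-copy that would otherwise be emitted
  // alongside the real reduction loop (illustrative only):
  scf.for %i = %lo to %hi step %c1 {
    %t = load %x[] : memref<f32>
    store %t, %x[] : memref<f32>   // x = x, no effect
  }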
"reduction"], + doc = "x = SUM_ij a(i,j)" +} + +// CHECK-LABEL: func @sum_reduction( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_2:.*]] = constant 999 : index +// CHECK: %[[VAL_3:.*]] = constant 10 : index +// CHECK: %[[VAL_4:.*]] = constant 0 : index +// CHECK: %[[VAL_5:.*]] = constant 1 : index +// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref +// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref +// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_2]]) : memref +// CHECK: %[[VAL_9:.*]] = alloca() : memref +// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { +// CHECK: %[[VAL_11:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref +// CHECK: %[[VAL_12:.*]] = addi %[[VAL_10]], %[[VAL_5]] : index +// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref +// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_11]] to %[[VAL_13]] step %[[VAL_5]] { +// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]][] : memref +// CHECK: %[[VAL_16:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref +// CHECK: %[[VAL_17:.*]] = addf %[[VAL_15]], %[[VAL_16]] : f32 +// CHECK: store %[[VAL_17]], %[[VAL_9]][] : memref +// CHECK: } +// CHECK: } +// CHECK: %[[VAL_18:.*]] = tensor_load %[[VAL_9]] : memref +// CHECK: return %[[VAL_18]] : tensor +// CHECK: } +func @sum_reduction(%arga: tensor<10x20xf32>, %argx: tensor) -> tensor { + %0 = linalg.generic #trait_sum_reduction + ins(%arga : tensor<10x20xf32>) + init(%argx : tensor) { + ^bb(%a : f32, %x : f32): + %0 = addf %x, %a : f32 + linalg.yield %0: f32 + } -> tensor + return %0 : tensor +} + +#trait_sampled_dense_dense = { + indexing_maps = [ + affine_map<(i,j,k) -> (i,j)>, // S + affine_map<(i,j,k) -> (i,k)>, // A + affine_map<(i,j,k) -> (k,j)>, // B + affine_map<(i,j,k) -> (i,j)> // X (out) + ], + sparse = [ + [ "S", "S" ], // S + [ "D", "D" ], // A + [ "D", "D" ], // B + [ "D", "D" ] // X + ], + iterator_types = ["parallel", "parallel", "reduction"], + doc = "X(i,j) = S(i,j) SUM_k A(i,k) B(k,j)" +} + +// CHECK-LABEL: func @sampled_dense_dense( +// CHECK-SAME: %[[VAL_0:.*0]]: tensor, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor, +// CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { +// CHECK: %[[VAL_4:.*]] = constant 999 : index +// CHECK: %[[VAL_5:.*]] = constant 0 : index +// CHECK: %[[VAL_6:.*]] = constant 1 : index +// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_12:.*]] = dim %[[VAL_1]], %[[VAL_5]] : tensor +// CHECK: %[[VAL_13:.*]] = dim %[[VAL_1]], %[[VAL_6]] : tensor +// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_12]], %[[VAL_13]]) : memref +// CHECK: %[[VAL_15:.*]] = dim %[[VAL_2]], %[[VAL_5]] : tensor +// CHECK: %[[VAL_16:.*]] = dim %[[VAL_2]], %[[VAL_6]] : tensor +// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_15]], %[[VAL_16]]) : memref +// CHECK: %[[VAL_18:.*]] = dim %[[VAL_3]], %[[VAL_5]] : tensor +// CHECK: %[[VAL_19:.*]] = dim %[[VAL_3]], %[[VAL_6]] : tensor +// CHECK: %[[VAL_20:.*]] = alloca(%[[VAL_18]], %[[VAL_19]]) : memref +// CHECK: %[[VAL_21:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_22:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref +// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_21]] to %[[VAL_22]] step %[[VAL_6]] { +// CHECK: %[[VAL_24:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_23]]] : 
diff --git a/mlir/test/Dialect/Linalg/sparse_2d.mlir b/mlir/test/Dialect/Linalg/sparse_2d.mlir
--- a/mlir/test/Dialect/Linalg/sparse_2d.mlir
+++ b/mlir/test/Dialect/Linalg/sparse_2d.mlir
@@ -1056,3 +1056,131 @@
   } -> tensor<16xf32>
   return %0 : tensor<16xf32>
 }
+
+#trait_sum_reduction = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // a
+    affine_map<(i,j) -> ()>      // x (scalar out)
+  ],
+  sparse = [
+    [ "D","S" ],  // a
+    [         ]   // x
+  ],
+  iterator_types = ["reduction", "reduction"],
+  doc = "x = SUM_ij a(i,j)"
+}
+
+// CHECK-LABEL: func @sum_reduction(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[VAL_2:.*]] = constant 999 : index
+// CHECK: %[[VAL_3:.*]] = constant 10 : index
+// CHECK: %[[VAL_4:.*]] = constant 0 : index
+// CHECK: %[[VAL_5:.*]] = constant 1 : index
+// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
+// CHECK: %[[VAL_9:.*]] = alloca() : memref<f32>
+// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
+// CHECK: %[[VAL_11:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<?xindex>
+// CHECK: %[[VAL_12:.*]] = addi %[[VAL_10]], %[[VAL_5]] : index
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_11]] to %[[VAL_13]] step %[[VAL_5]] {
+// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]][] : memref<f32>
+// CHECK: %[[VAL_16:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xf32>
+// CHECK: %[[VAL_17:.*]] = addf %[[VAL_15]], %[[VAL_16]] : f32
+// CHECK: store %[[VAL_17]], %[[VAL_9]][] : memref<f32>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_18:.*]] = tensor_load %[[VAL_9]] : memref<f32>
+// CHECK: return %[[VAL_18]] : tensor<f32>
+// CHECK: }
+func @sum_reduction(%arga: tensor<10x20xf32>, %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_sum_reduction
+    ins(%arga : tensor<10x20xf32>)
+    init(%argx : tensor<f32>) {
+      ^bb(%a : f32, %x : f32):
+        %0 = addf %x, %a : f32
+        linalg.yield %0: f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  sparse = [
+    [ "S", "S" ],  // S
+    [ "D", "D" ],  // A
+    [ "D", "D" ],  // B
+    [ "D", "D" ]   // X
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) = S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+// CHECK-LABEL: func @sampled_dense_dense(
+// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+// CHECK: %[[VAL_4:.*]] = constant 999 : index
+// CHECK: %[[VAL_5:.*]] = constant 0 : index
+// CHECK: %[[VAL_6:.*]] = constant 1 : index
+// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
+// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_4]]) : memref<?xf32>
+// CHECK: %[[VAL_12:.*]] = dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?xf32>
+// CHECK: %[[VAL_13:.*]] = dim %[[VAL_1]], %[[VAL_6]] : tensor<?x?xf32>
+// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_12]], %[[VAL_13]]) : memref<?x?xf32>
+// CHECK: %[[VAL_15:.*]] = dim %[[VAL_2]], %[[VAL_5]] : tensor<?x?xf32>
+// CHECK: %[[VAL_16:.*]] = dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32>
+// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_15]], %[[VAL_16]]) : memref<?x?xf32>
+// CHECK: %[[VAL_18:.*]] = dim %[[VAL_3]], %[[VAL_5]] : tensor<?x?xf32>
+// CHECK: %[[VAL_19:.*]] = dim %[[VAL_3]], %[[VAL_6]] : tensor<?x?xf32>
+// CHECK: %[[VAL_20:.*]] = alloca(%[[VAL_18]], %[[VAL_19]]) : memref<?x?xf32>
+// CHECK: %[[VAL_21:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_22:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_21]] to %[[VAL_22]] step %[[VAL_6]] {
+// CHECK: %[[VAL_24:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_25:.*]] = %[[VAL_5]] to %[[VAL_15]] step %[[VAL_6]] {
+// CHECK: %[[VAL_26:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK: %[[VAL_27:.*]] = addi %[[VAL_23]], %[[VAL_6]] : index
+// CHECK: %[[VAL_28:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_27]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_26]] to %[[VAL_28]] step %[[VAL_6]] {
+// CHECK: %[[VAL_30:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_31:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_24]], %[[VAL_30]]] : memref<?x?xf32>
+// CHECK: %[[VAL_32:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_29]]] : memref<?xf32>
+// CHECK: %[[VAL_33:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_24]], %[[VAL_25]]] : memref<?x?xf32>
+// CHECK: %[[VAL_34:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_25]], %[[VAL_30]]] : memref<?x?xf32>
+// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_33]], %[[VAL_34]] : f32
+// CHECK: %[[VAL_36:.*]] = mulf %[[VAL_32]], %[[VAL_35]] : f32
+// CHECK: %[[VAL_37:.*]] = addf %[[VAL_31]], %[[VAL_36]] : f32
+// CHECK: store %[[VAL_37]], %[[VAL_20]]{{\[}}%[[VAL_24]], %[[VAL_30]]] : memref<?x?xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_38:.*]] = tensor_load %[[VAL_20]] : memref<?x?xf32>
+// CHECK: return %[[VAL_38]] : tensor<?x?xf32>
+// CHECK: }
+func @sampled_dense_dense(%args: tensor<?x?xf32>,
+                          %arga: tensor<?x?xf32>,
+                          %argb: tensor<?x?xf32>,
+                          %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
+    init(%argx : tensor<?x?xf32>) {
+      ^bb(%s : f32, %a : f32, %b : f32, %x : f32):
+        %0 = mulf %a, %b : f32
+        %1 = mulf %s, %0 : f32
+        %2 = addf %x, %1 : f32
+        linalg.yield %2: f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
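The sampled_dense_dense test is the classical SDDMM pattern: only positions (i,j) present in the sparse sampling matrix S contribute, so the generated nest follows the sparse structure of S for i and j while the reduction index k stays dense. Stripped of captures (names illustrative, structure from the CHECK lines):

  scf.for %pi = %row_lo to %row_hi step %c1 {     // sparse rows of S
    %i = load %indices0[%pi] : memref<?xindex>
    scf.for %k = %c0 to %kdim step %c1 {          // dense reduction k
      %col_lo = load %pointers1[%pi] : memref<?xindex>
      %col_hi = load %pointers1[%pi1] : memref<?xindex>  // %pi1 = addi %pi, %c1
      scf.for %pj = %col_lo to %col_hi step %c1 { // sparse nonzeros in the row
        %j = load %indices1[%pj] : memref<?xindex>
        // x(i,j) = x(i,j) + s(i,j) * (a(i,k) * b(k,j))
      }
    }
  }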
diff --git a/mlir/test/Dialect/Linalg/sparse_3d.mlir b/mlir/test/Dialect/Linalg/sparse_3d.mlir
--- a/mlir/test/Dialect/Linalg/sparse_3d.mlir
+++ b/mlir/test/Dialect/Linalg/sparse_3d.mlir
@@ -1223,3 +1223,61 @@
   } -> tensor<?x?xf32>
   return %0 : tensor<?x?xf32>
 }
+#trait_sum_reduction = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j,k)>,  // a
+    affine_map<(i,j,k) -> ()>        // x (scalar out)
+  ],
+  sparse = [
+    [ "S", "S", "S" ],  // a
+    [               ]   // x
+  ],
+  iterator_types = ["reduction", "reduction", "reduction"],
+  doc = "x = SUM_ijk a(i,j,k)"
+}
+
+// CHECK-LABEL: func @sum_reduction(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[VAL_2:.*]] = constant 999 : index
+// CHECK: %[[VAL_3:.*]] = constant 0 : index
+// CHECK: %[[VAL_4:.*]] = constant 1 : index
+// CHECK: %[[VAL_5:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
+// CHECK: %[[VAL_12:.*]] = alloca() : memref<f32>
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK: %[[VAL_14:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_4]] {
+// CHECK: %[[VAL_16:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_17:.*]] = addi %[[VAL_15]], %[[VAL_4]] : index
+// CHECK: %[[VAL_18:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_16]] to %[[VAL_18]] step %[[VAL_4]] {
+// CHECK: %[[VAL_20:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = addi %[[VAL_19]], %[[VAL_4]] : index
+// CHECK: %[[VAL_22:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_20]] to %[[VAL_22]] step %[[VAL_4]] {
+// CHECK: %[[VAL_24:.*]] = load %[[VAL_12]][] : memref<f32>
+// CHECK: %[[VAL_25:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xf32>
+// CHECK: %[[VAL_26:.*]] = addf %[[VAL_24]], %[[VAL_25]] : f32
+// CHECK: store %[[VAL_26]], %[[VAL_12]][] : memref<f32>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_27:.*]] = tensor_load %[[VAL_12]] : memref<f32>
+// CHECK: return %[[VAL_27]] : tensor<f32>
+// CHECK: }
+func @sum_reduction(%arga: tensor<10x20x30xf32>, %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_sum_reduction
+    ins(%arga : tensor<10x20x30xf32>)
+    init(%argx : tensor<f32>) {
+      ^bb(%a : f32, %x : f32):
+        %0 = addf %x, %a : f32
+        linalg.yield %0: f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}
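With all three dimensions of a annotated "S", the 3-D reduction nests one pointer-range loop per dimension around the same scalar accumulation. As in the 1-D case, the indices arrays (%[[VAL_6]], %[[VAL_8]], %[[VAL_10]]) are allocated but never read, since summing every stored value needs no coordinates. Schematically (names illustrative; each inner bound pair is loaded from the pointer array at the enclosing position, as the addi/load pairs above show):

  scf.for %pi = %i_lo to %i_hi step %c1 {       // positions in dimension i
    scf.for %pj = %j_lo to %j_hi step %c1 {     // positions in dimension j
      scf.for %pk = %k_lo to %k_hi step %c1 {   // positions in dimension k
        %t0 = load %acc[] : memref<f32>
        %t1 = load %values[%pk] : memref<?xf32>
        %t2 = addf %t0, %t1 : f32
        store %t2, %acc[] : memref<f32>
      }
    }
  }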