diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1055,18 +1055,22 @@
   let summary = "Custom reduction operation utilized within linalg.generic";
   let description = [{
     Defines a computation with a `linalg.generic` operation that takes two
-    operands and an identity value and reduces all values down to a single
-    result based on the computation in the region.
+    operands and an identity value and reduces all stored values down to a
+    single result based on the computation in the region.

     The region must contain exactly one block taking two arguments. The block
     must end with a sparse_tensor.yield and the output must match the input
     argument types.

-    Note that this operation is only required for custom reductions beyond the
-    standard operations (add, mul, and, or, etc). The `linalg.generic`
-    `iterator_types` defines which indices are being reduced. When the associated
-    operands are used in an operation, a reduction will occur. The use of this
-    explicit `reduce` operation is not required in most cases.
+    Note that this operation is only required for custom reductions beyond
+    the standard reduction operations (add, sub, or, xor) that can be
+    sparsified by merely reducing the stored values. More elaborate reduction
+    operations (mul, and, min, max, etc.) would need to account for implicit
+    zeros as well. They can still be handled using this custom reduction
+    operation. The `linalg.generic` `iterator_types` defines which indices
+    are being reduced. When the associated operands are used in an operation,
+    a reduction will occur. The use of this explicit `reduce` operation
+    is not required in most cases.

     Example of Matrix->Vector reduction using max(product(x_i), 100):

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1848,6 +1848,24 @@
     if (!findSparseAnnotations(env, idxReducBased))
       return failure();

+    // Only standard reduction operations (add, sub, or, xor) that can be
+    // sparsified by merely reducing the stored values are admissible. More
+    // elaborate reduction operations (such as mul, and, min, max) would need
+    // to know whether implicit zeros occur as well. They can still be
+    // implemented with a custom reduction operation, accepted here as well.
+    if (op.getNumReductionLoops() > 0) {
+      Operation *yield = op.getRegion().front().getTerminator();
+      assert(isa<linalg::YieldOp>(yield));
+      Operation *redop = yield->getOperand(0).getDefiningOp();
+      if (!isa<arith::AddFOp>(redop) && !isa<complex::AddOp>(redop) &&
+          !isa<arith::AddIOp>(redop) && !isa<arith::SubFOp>(redop) &&
+          !isa<complex::SubOp>(redop) && !isa<arith::SubIOp>(redop) &&
+          !isa<arith::OrIOp>(redop) && !isa<arith::XOrIOp>(redop) &&
+          !isa<ReduceOp>(redop)) {
+        return failure();
+      }
+    }
+
     // Constructs the tensor expressions tree from `op`, returns failure if the
     // tree can not be built or the tensor expression is inadmissible.
     if (failed(env.initTensorExp()))
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -5,213 +5,11 @@

 // -----

-// Check that we recognize a reduction with a mul operator.
-// We use two dimensions here to check that the vectorization -// is not affected by how the outer loop is layed out. -// In other words, we should be able to vectorize the sparse inner loop -// regardless of whether the outer loop is dense or sparse. -// -// For this particular test, we expect: -// With vectorization on: -// dense scf.for -// init vector_accumulator = {scalar_accumulator, 1.0, 1.0, ...} -// sparse scf.for -// vectorized mul in vector_accumulator, vector_input -// horizontal reduction of the vector_accumulator to scalar_accumulator -// final store of scalar_accumulaor -// -// With vectorization off: -// dense scf.for -// sparse scf.for -// mul in accumulator -// final store -// -// CHECK-ON-LABEL: func.func @sparse_product_reduction_dense_sparse( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-ON: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_13]]] : memref -// CHECK-ON: %[[VAL_16:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : index -// CHECK-ON: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref -// CHECK-ON: %[[VAL_18:.*]] = vector.insertelement %[[VAL_14]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64> -// CHECK-ON: %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_2]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (vector<8xf64>) { -// CHECK-ON: %[[VAL_22:.*]] = affine.min #map(%[[VAL_17]], %[[VAL_20]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_23:.*]] = vector.create_mask %[[VAL_22]] : vector<8xi1> -// CHECK-ON: %[[VAL_24:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_20]]], %[[VAL_23]], %[[VAL_4]] : memref, vector<8xi1>, vector<8xf64> into vector<8xf64> -// CHECK-ON: %[[VAL_25:.*]] = arith.mulf %[[VAL_21]], %[[VAL_24]] : vector<8xf64> -// CHECK-ON: %[[VAL_26:.*]] = arith.select %[[VAL_23]], %[[VAL_25]], %[[VAL_21]] : vector<8xi1>, vector<8xf64> -// CHECK-ON: scf.yield %[[VAL_26]] : vector<8xf64> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_27:.*]] = vector.reduction , %[[VAL_28:.*]] : vector<8xf64> into f64 -// CHECK-ON: scf.yield %[[VAL_27]] : f64 -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: memref.store %[[VAL_29:.*]], %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_30:.*]] = bufferization.to_tensor %[[VAL_10]] : memref -// CHECK-ON: return %[[VAL_30]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_product_reduction_dense_sparse( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: 
%[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor> -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_9:.*]] = scf.for %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (f64) { -// CHECK-OFF: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_10]]] : memref -// CHECK-OFF: %[[VAL_13:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : index -// CHECK-OFF: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_13]]] : memref -// CHECK-OFF: %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_12]] to %[[VAL_14]] step %[[VAL_3]] iter_args(%[[VAL_17:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-OFF: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_16]]] : memref -// CHECK-OFF: %[[VAL_19:.*]] = arith.mulf %[[VAL_17]], %[[VAL_18]] : f64 -// CHECK-OFF: scf.yield %[[VAL_19]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: scf.yield %[[VAL_20:.*]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_21:.*]], %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_22:.*]] = bufferization.to_tensor %[[VAL_7]] : memref -// CHECK-OFF: return %[[VAL_22]] : tensor -// CHECK-OFF: } - -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["dense","compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i,j) -> (i,j)>, // a (in) - affine_map<(i,j) -> ()> // x (out) - ], - iterator_types = ["reduction", "reduction"] -} - -func.func @sparse_product_reduction_dense_sparse(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: f64, %x: f64): - %t = arith.mulf %x, %a: f64 - linalg.yield %t : f64 - } -> tensor - return %0 : tensor -} - -// ----- - -// Same as sparse_product_reduction_dense_sparse but with the outer loop being sparse. 
-// -// CHECK-ON-LABEL: func.func @sparse_product_reduction_sparse_sparse( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref -// CHECK-ON: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref -// CHECK-ON: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_6]] iter_args(%[[VAL_16:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-ON: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref -// CHECK-ON: %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_6]] : index -// CHECK-ON: %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref -// CHECK-ON: %[[VAL_20:.*]] = vector.insertelement %[[VAL_16]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64> -// CHECK-ON: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_17]] to %[[VAL_19]] step %[[VAL_2]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (vector<8xf64>) { -// CHECK-ON: %[[VAL_24:.*]] = affine.min #map(%[[VAL_19]], %[[VAL_22]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_25:.*]] = vector.create_mask %[[VAL_24]] : vector<8xi1> -// CHECK-ON: %[[VAL_26:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_22]]], %[[VAL_25]], %[[VAL_4]] : memref, vector<8xi1>, vector<8xf64> into vector<8xf64> -// CHECK-ON: %[[VAL_27:.*]] = arith.mulf %[[VAL_23]], %[[VAL_26]] : vector<8xf64> -// CHECK-ON: %[[VAL_28:.*]] = arith.select %[[VAL_25]], %[[VAL_27]], %[[VAL_23]] : vector<8xi1>, vector<8xf64> -// CHECK-ON: scf.yield %[[VAL_28]] : vector<8xf64> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_29:.*]] = vector.reduction , %[[VAL_30:.*]] : vector<8xf64> into f64 -// CHECK-ON: scf.yield %[[VAL_29]] : f64 -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: memref.store %[[VAL_31:.*]], %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_10]] : memref -// CHECK-ON: return %[[VAL_32]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_product_reduction_sparse_sparse( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_9:.*]] = 
memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_11:.*]] = scf.for %[[VAL_12:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] iter_args(%[[VAL_13:.*]] = %[[VAL_8]]) -> (f64) { -// CHECK-OFF: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref -// CHECK-OFF: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_3]] : index -// CHECK-OFF: %[[VAL_16:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_15]]] : memref -// CHECK-OFF: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_3]] iter_args(%[[VAL_19:.*]] = %[[VAL_13]]) -> (f64) { -// CHECK-OFF: %[[VAL_20:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_18]]] : memref -// CHECK-OFF: %[[VAL_21:.*]] = arith.mulf %[[VAL_19]], %[[VAL_20]] : f64 -// CHECK-OFF: scf.yield %[[VAL_21]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: scf.yield %[[VAL_22:.*]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_23:.*]], %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_7]] : memref -// CHECK-OFF: return %[[VAL_24]] : tensor -// CHECK-OFF: } -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed","compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i,j) -> (i,j)>, // a (in) - affine_map<(i,j) -> ()> // x (out) - ], - iterator_types = ["reduction", "reduction"] -} - -func.func @sparse_product_reduction_sparse_sparse(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: f64, %x: f64): - %t = arith.mulf %x, %a: f64 - linalg.yield %t : f64 - } -> tensor - return %0 : tensor -} - -// ----- - -// sparse_product_reduction_dense_sparse and -// sparse_product_reduction_sparse_sparse established that the outer loop -// doesn't matter for vectorization. -// As a result from this point forward, use tensors with fewer dimensions. - // Check that we vectorize reductions with ori. -// Note: The weird element type here is to check that we create the right -// constant type for the pass-through value. + // CHECK-ON-LABEL: func.func @sparse_reduction_ori( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -238,8 +36,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_ori( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -268,7 +66,7 @@ } func.func @sparse_reduction_ori(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -283,13 +81,12 @@ // ----- // Same test as sparse_reduction_ori except that the accumulator is on the -// rhs of the operation. -// This checks that we can recognize a reduction irrespective to where the -// accumalator appears on commutative operations. 
+// rhs of the operation. This checks that we can recognize a reduction +// irrespective to where the accumulator appears on commutative operations. // CHECK-ON-LABEL: func.func @sparse_reduction_ori_accumulator_on_rhs( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -316,8 +113,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_ori_accumulator_on_rhs( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -346,7 +143,7 @@ } func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -360,11 +157,11 @@ // ----- -// Check that we vectorize reduction with subi. +// Check that we vectorize reductions with subi. // // CHECK-ON-LABEL: func.func @sparse_reduction_subi( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0> : vector<8xi32> @@ -391,8 +188,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_subi( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -421,7 +218,7 @@ } func.func @sparse_reduction_subi(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -435,10 +232,8 @@ // ----- -// From this point forward, we essentially have the same test for all -// arithmetic operation. This is for a code coverage perspective. +// Check that we vectorize reductions with xor. -// Check that we vectorize xor. // CHECK-ON-LABEL: func.func @sparse_reduction_xor( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -512,156 +307,9 @@ } // ----- -// Check that we vectorize and. 
-// CHECK-ON-LABEL: func.func @sparse_reduction_and( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref -// CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK-ON: %[[VAL_12:.*]] = vector.broadcast %[[VAL_9]] : i32 to vector<8xi32> -// CHECK-ON: %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) { -// CHECK-ON: %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1> -// CHECK-ON: %[[VAL_18:.*]] = vector.maskedload %[[VAL_7]]{{\[}}%[[VAL_14]]], %[[VAL_17]], %[[VAL_3]] : memref, vector<8xi1>, vector<8xi32> into vector<8xi32> -// CHECK-ON: %[[VAL_19:.*]] = arith.andi %[[VAL_15]], %[[VAL_18]] : vector<8xi32> -// CHECK-ON: %[[VAL_20:.*]] = arith.select %[[VAL_17]], %[[VAL_19]], %[[VAL_15]] : vector<8xi1>, vector<8xi32> -// CHECK-ON: scf.yield %[[VAL_20]] : vector<8xi32> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_21:.*]] = vector.reduction , %[[VAL_22:.*]] : vector<8xi32> into i32 -// CHECK-ON: memref.store %[[VAL_21]], %[[VAL_8]][] : memref -// CHECK-ON: %[[VAL_23:.*]] = bufferization.to_tensor %[[VAL_8]] : memref -// CHECK-ON: return %[[VAL_23]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_reduction_and( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) { -// CHECK-OFF: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref -// CHECK-OFF: %[[VAL_14:.*]] = arith.andi %[[VAL_12]], %[[VAL_13]] : i32 -// CHECK-OFF: scf.yield %[[VAL_14]] : i32 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref -// CHECK-OFF: return %[[VAL_16]] : tensor -// CHECK-OFF: } - -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i) -> (i)>, // a (in) - 
affine_map<(i) -> ()> // x (out) - ], - iterator_types = ["reduction"] -} - -func.func @sparse_reduction_and(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %t = arith.andi %x, %a: i32 - linalg.yield %t : i32 - } -> tensor - return %0 : tensor -} - -// ----- -// Check that we vectorize muli. -// CHECK-ON-LABEL: func.func @sparse_reduction_muli( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant dense<0> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_9]][] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK-ON: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref -// CHECK-ON: %[[VAL_13:.*]] = vector.insertelement %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32> -// CHECK-ON: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_2]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (vector<8xi32>) { -// CHECK-ON: %[[VAL_17:.*]] = affine.min #map(%[[VAL_12]], %[[VAL_15]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_18:.*]] = vector.create_mask %[[VAL_17]] : vector<8xi1> -// CHECK-ON: %[[VAL_19:.*]] = vector.maskedload %[[VAL_8]]{{\[}}%[[VAL_15]]], %[[VAL_18]], %[[VAL_5]] : memref, vector<8xi1>, vector<8xi32> into vector<8xi32> -// CHECK-ON: %[[VAL_20:.*]] = arith.muli %[[VAL_16]], %[[VAL_19]] : vector<8xi32> -// CHECK-ON: %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_20]], %[[VAL_16]] : vector<8xi1>, vector<8xi32> -// CHECK-ON: scf.yield %[[VAL_21]] : vector<8xi32> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_22:.*]] = vector.reduction , %[[VAL_23:.*]] : vector<8xi32> into i32 -// CHECK-ON: memref.store %[[VAL_22]], %[[VAL_9]][] : memref -// CHECK-ON: %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_9]] : memref -// CHECK-ON: return %[[VAL_24]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_reduction_muli( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) { -// CHECK-OFF: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref -// CHECK-OFF: 
%[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_13]] : i32 -// CHECK-OFF: scf.yield %[[VAL_14]] : i32 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref -// CHECK-OFF: return %[[VAL_16]] : tensor -// CHECK-OFF: } -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}> +// Check that we vectorize reductions with addi. -#trait = { - indexing_maps = [ - affine_map<(i) -> (i)>, // a (in) - affine_map<(i) -> ()> // x (out) - ], - iterator_types = ["reduction"] -} - -func.func @sparse_reduction_muli(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %t = arith.muli %x, %a: i32 - linalg.yield %t : i32 - } -> tensor - return %0 : tensor -} - -// ----- -// Check that we vectorize addi. // CHECK-ON-LABEL: func.func @sparse_reduction_addi( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -722,7 +370,7 @@ } func.func @sparse_reduction_addi(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -735,7 +383,9 @@ } // ----- -// Check that we vectorize subf. + +// Check that we vectorize reductions with subf. + // CHECK-ON-LABEL: func.func @sparse_reduction_subf( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -809,10 +459,12 @@ } // ----- -// Check that we vectorize addf. + +// Check that we vectorize reductions with addf. + // CHECK-ON-LABEL: func.func @sparse_reduction_addf( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -839,8 +491,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_addf( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir @@ -28,7 +28,6 @@ // Reduction in this file _are_ supported by the AArch64 SVE backend #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> -#DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense" ] }> #trait_reduction = { indexing_maps = [ @@ -66,18 +65,6 @@ return %0 : tensor } - func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>, - %argx: tensor) -> tensor { - %0 = linalg.generic #trait_reduction - ins(%arga: tensor<32xi32, #DV>) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %0 = arith.andi %x, %a : i32 - linalg.yield %0 : i32 - } -> tensor - return %0 : tensor - } - func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>, 
%argx: tensor) -> tensor { %0 = linalg.generic #trait_reduction @@ -130,59 +117,37 @@ 2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0 ]> : tensor<32xf32> - %c_1_i32 = arith.constant dense<[ - 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3 - ]> : tensor<32xi32> - - %c_1_f32 = arith.constant dense<[ - 1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0 - ]> : tensor<32xf32> - // Convert constants to annotated tensors. %sparse_input_i32 = sparse_tensor.convert %c_0_i32 : tensor<32xi32> to tensor<32xi32, #SV> %sparse_input_f32 = sparse_tensor.convert %c_0_f32 : tensor<32xf32> to tensor<32xf32, #SV> - %dense_input_i32 = sparse_tensor.convert %c_1_i32 - : tensor<32xi32> to tensor<32xi32, #DV> - %dense_input_f32 = sparse_tensor.convert %c_1_f32 - : tensor<32xf32> to tensor<32xf32, #DV> // Call the kernels. %0 = call @sum_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor %1 = call @sum_reduction_f32(%sparse_input_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor - %4 = call @and_reduction_i32(%dense_input_i32, %ri) - : (tensor<32xi32, #DV>, tensor) -> tensor - %5 = call @or_reduction_i32(%sparse_input_i32, %ri) + %2 = call @or_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor - %6 = call @xor_reduction_i32(%sparse_input_i32, %ri) + %3 = call @xor_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor // Verify results. // // CHECK: 26 // CHECK: 27.5 - // CHECK: 1 // CHECK: 15 // CHECK: 10 // call @dump_i32(%0) : (tensor) -> () call @dump_f32(%1) : (tensor) -> () - call @dump_i32(%4) : (tensor) -> () - call @dump_i32(%5) : (tensor) -> () - call @dump_i32(%6) : (tensor) -> () + call @dump_i32(%2) : (tensor) -> () + call @dump_i32(%3) : (tensor) -> () // Release the resources. bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV> bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV> - bufferization.dealloc_tensor %dense_input_i32 : tensor<32xi32, #DV> - bufferization.dealloc_tensor %dense_input_f32 : tensor<32xf32, #DV> return } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir @@ -15,10 +15,6 @@ // REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" // RUN: %{command} -// Product reductions - kept in a seperate file as these are not supported by -// the AArch64 SVE backend (so the set-up is a bit different to -// sparse_reducitons.mlir) - #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> #DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense" ] }> @@ -28,32 +24,76 @@ affine_map<(i) -> ()> // x (scalar out) ], iterator_types = ["reduction"], - doc = "x += OPER_i a(i)" + doc = "x += PROD_CUSTOM_i a(i)" } // An example of vector reductions. 
module { - func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>, - %argx: tensor) -> tensor { + func.func @prod_dreduction_i32(%arga: tensor<32xi32, #DV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor %0 = linalg.generic #trait_reduction ins(%arga: tensor<32xi32, #DV>) outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %0 = arith.muli %x, %a : i32 - linalg.yield %0 : i32 + ^bb(%a: i32, %b: i32): + %1 = sparse_tensor.reduce %a, %b, %c : i32 { + ^bb0(%x: i32, %y: i32): + %2 = arith.muli %x, %y : i32 + sparse_tensor.yield %2 : i32 + } + linalg.yield %1 : i32 } -> tensor return %0 : tensor } - func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>, - %argx: tensor) -> tensor { + func.func @prod_dreduction_f32(%arga: tensor<32xf32, #DV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor %0 = linalg.generic #trait_reduction ins(%arga: tensor<32xf32, #DV>) outs(%argx: tensor) { - ^bb(%a: f32, %x: f32): - %0 = arith.mulf %x, %a : f32 - linalg.yield %0 : f32 + ^bb(%a: f32, %b: f32): + %1 = sparse_tensor.reduce %a, %b, %c : f32 { + ^bb0(%x: f32, %y: f32): + %2 = arith.mulf %x, %y : f32 + sparse_tensor.yield %2 : f32 + } + linalg.yield %1 : f32 + } -> tensor + return %0 : tensor + } + + func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor + %0 = linalg.generic #trait_reduction + ins(%arga: tensor<32xi32, #SV>) + outs(%argx: tensor) { + ^bb(%a: i32, %b: i32): + %1 = sparse_tensor.reduce %a, %b, %c : i32 { + ^bb0(%x: i32, %y: i32): + %2 = arith.muli %x, %y : i32 + sparse_tensor.yield %2 : i32 + } + linalg.yield %1 : i32 + } -> tensor + return %0 : tensor + } + + func.func @prod_sreduction_f32(%arga: tensor<32xf32, #SV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor + %0 = linalg.generic #trait_reduction + ins(%arga: tensor<32xf32, #SV>) + outs(%argx: tensor) { + ^bb(%a: f32, %b: f32): + %1 = sparse_tensor.reduce %a, %b, %c : f32 { + ^bb0(%x: f32, %y: f32): + %2 = arith.mulf %x, %y : f32 + sparse_tensor.yield %2 : f32 + } + linalg.yield %1 : f32 } -> tensor return %0 : tensor } @@ -74,6 +114,20 @@ %ri = arith.constant dense< 7 > : tensor %rf = arith.constant dense< 2.0 > : tensor + // Vectors with a few zeros. + %c_0_i32 = arith.constant dense<[ + 1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1, 1, 7, 3 + ]> : tensor<32xi32> + + %c_0_f32 = arith.constant dense<[ + 1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0 + ]> : tensor<32xf32> + + // Vectors with no zeros. %c_1_i32 = arith.constant dense<[ 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3 @@ -87,28 +141,64 @@ ]> : tensor<32xf32> // Convert constants to annotated tensors. 
- %dense_input_i32 = sparse_tensor.convert %c_1_i32 + %d0_i32 = sparse_tensor.convert %c_0_i32 : tensor<32xi32> to tensor<32xi32, #DV> - %dense_input_f32 = sparse_tensor.convert %c_1_f32 + %d0_f32 = sparse_tensor.convert %c_0_f32 : tensor<32xf32> to tensor<32xf32, #DV> + %s0_i32 = sparse_tensor.convert %c_0_i32 + : tensor<32xi32> to tensor<32xi32, #SV> + %s0_f32 = sparse_tensor.convert %c_0_f32 + : tensor<32xf32> to tensor<32xf32, #SV> + %d1_i32 = sparse_tensor.convert %c_1_i32 + : tensor<32xi32> to tensor<32xi32, #DV> + %d1_f32 = sparse_tensor.convert %c_1_f32 + : tensor<32xf32> to tensor<32xf32, #DV> + %s1_i32 = sparse_tensor.convert %c_1_i32 + : tensor<32xi32> to tensor<32xi32, #SV> + %s1_f32 = sparse_tensor.convert %c_1_f32 + : tensor<32xf32> to tensor<32xf32, #SV> // Call the kernels. - %2 = call @prod_reduction_i32(%dense_input_i32, %ri) - : (tensor<32xi32, #DV>, tensor) -> tensor - %3 = call @prod_reduction_f32(%dense_input_f32, %rf) - : (tensor<32xf32, #DV>, tensor) -> tensor - - // Verify results. + %0 = call @prod_dreduction_i32(%d0_i32, %ri) : (tensor<32xi32, #DV>, tensor) -> tensor + %1 = call @prod_dreduction_f32(%d0_f32, %rf) : (tensor<32xf32, #DV>, tensor) -> tensor + %2 = call @prod_sreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor + %3 = call @prod_sreduction_f32(%s0_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor + %4 = call @prod_dreduction_i32(%d1_i32, %ri) : (tensor<32xi32, #DV>, tensor) -> tensor + %5 = call @prod_dreduction_f32(%d1_f32, %rf) : (tensor<32xf32, #DV>, tensor) -> tensor + %6 = call @prod_sreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor + %7 = call @prod_sreduction_f32(%s1_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor + + // Verify results. Note that the custom reduction gave permission + // to treat an explicit vs implicit zero differently to compute the + // full product reduction. A "standard" product reduction would + // have to return 0 for any implicit zero occurrence too. // + // CHECK: 0 + // CHECK: 3087 + // CHECK: 14 + // CHECK: 3087 + // CHECK: 168 // CHECK: 3087 // CHECK: 168 // + call @dump_i32(%0) : (tensor) -> () + call @dump_f32(%1) : (tensor) -> () call @dump_i32(%2) : (tensor) -> () call @dump_f32(%3) : (tensor) -> () + call @dump_i32(%4) : (tensor) -> () + call @dump_f32(%5) : (tensor) -> () + call @dump_i32(%6) : (tensor) -> () + call @dump_f32(%7) : (tensor) -> () // Release the resources. - bufferization.dealloc_tensor %dense_input_i32 : tensor<32xi32, #DV> - bufferization.dealloc_tensor %dense_input_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %d0_i32 : tensor<32xi32, #DV> + bufferization.dealloc_tensor %d0_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %s0_i32 : tensor<32xi32, #SV> + bufferization.dealloc_tensor %s0_f32 : tensor<32xf32, #SV> + bufferization.dealloc_tensor %d1_i32 : tensor<32xi32, #DV> + bufferization.dealloc_tensor %d1_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %s1_i32 : tensor<32xi32, #SV> + bufferization.dealloc_tensor %s1_f32 : tensor<32xf32, #SV> return }
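For reference, the expected values in the CHECK lines of this last test can be
reproduced by hand from the constants it defines (a sketch of the arithmetic
only, not part of the patch): the i32 vector with zeros has nonzero entries
7, 3, 7, 3, so a product over just the stored values of its sparse form is
7 (the initial value) * 7 * 3 * 7 * 3 = 3087, while its dense form also stores
the zeros and therefore yields 0; the f32 vector with zeros likewise gives
2.0 * 3.5 * 2.0 = 14 in sparse form; and the zero-free vectors give 3087 (i32)
and 2.0 * 3.5 * 2.0 * 3.0 * 4.0 = 168 (f32) in either form. This is precisely
the explicit-versus-implicit-zero distinction that the custom
sparse_tensor.reduce makes explicit.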
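To summarize the two kernel shapes this patch distinguishes, here is a
condensed sketch (the function names are illustrative and the scalar tensor
types are spelled out as an assumption from the surrounding tests; it restates
code already shown above rather than adding anything new). The first form, a
plain product reduction, is no longer admitted by the sparsification pattern
after this change; the second form, which routes the product through
sparse_tensor.reduce, still is.

  #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

  #trait = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a (in)
      affine_map<(i) -> ()>    // x (scalar out)
    ],
    iterator_types = ["reduction"]
  }

  // Rejected by the sparsifier after this change: folding only the stored
  // values cannot account for the implicit zeros of %arga.
  func.func @prod_plain(%arga: tensor<32xi32, #SV>,
                        %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %x: i32):
        %t = arith.muli %x, %a : i32
        linalg.yield %t : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Still accepted: the custom reduction states explicitly that only the
  // stored values are folded into the result, starting from the value %c.
  func.func @prod_custom(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %b: i32):
        %1 = sparse_tensor.reduce %a, %b, %c : i32 {
          ^bb0(%x: i32, %y: i32):
            %2 = arith.muli %x, %y : i32
            sparse_tensor.yield %2 : i32
        }
        linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }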