diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -28,11 +28,18 @@
 using namespace mlir;
 using namespace mlir::sparse_tensor;
 
+//===----------------------------------------------------------------------===//
+// Declarations of data structures.
+//===----------------------------------------------------------------------===//
+
 namespace {
 
 // Iteration graph sorting.
 enum SortMask { kSparseOnly = 0x0, kIncludeDense = 0x1, kIncludeUndef = 0x2 };
 
+// Reduction kinds.
+enum Reduction { kSum, kProduct, kAnd, kOr, kXor };
+
 // Code generation.
 struct CodeGen {
   CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops)
@@ -68,6 +75,7 @@
   // is most effective; we could generalize to more outer and while-loops.
   unsigned redExp;
   Value redVal;
+  Reduction redKind;
   // Current vector length and mask.
   unsigned curVecLength;
   Value curVecMask;
@@ -75,8 +83,12 @@
 
 } // namespace
 
-// Helper method to apply dimension ordering permutation.
-static unsigned perm(SparseTensorEncodingAttr &enc, unsigned d) {
+//===----------------------------------------------------------------------===//
+// Sparse compiler analysis methods.
+//===----------------------------------------------------------------------===//
+
+/// Helper method to apply dimension ordering permutation.
+static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
   if (enc) {
     auto order = enc.getDimOrdering();
     if (order) {
@@ -87,8 +99,8 @@
   return d;
 }
 
-// Helper method to translate dim level type to internal representation.
-static Dim toDim(SparseTensorEncodingAttr &enc, unsigned d) {
+/// Helper method to translate dim level type to internal representation.
+static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
   if (enc) {
     SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
     if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
@@ -283,6 +295,83 @@
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse compiler synthesis methods.
+//===----------------------------------------------------------------------===//
+
+/// Maps reduction kind to name encoding.
+static StringRef getReductionName(Reduction kind) {
+  switch (kind) {
+  case kSum:
+    return "add";
+  case kProduct:
+    return "mul";
+  case kAnd:
+    return "and";
+  case kOr:
+    return "or";
+  case kXor:
+    return "xor";
+  }
+  llvm_unreachable("unknown reduction kind");
+}
+
+/// Maps operation to reduction.
+static Reduction getReduction(Kind kind) {
+  switch (kind) {
+  case Kind::kAddF:
+  case Kind::kAddI:
+  case Kind::kSubF:
+  case Kind::kSubI:
+    return kSum;
+  case Kind::kMulF:
+  case Kind::kMulI:
+    return kProduct;
+  case Kind::kAndI:
+    return kAnd;
+  case Kind::kOrI:
+    return kOr;
+  case Kind::kXorI:
+    return kXor;
+  default:
+    llvm_unreachable("unexpected reduction operator");
+  }
+}
+
+/// Generates an initial value for a vector reduction, following the scheme
+/// given in Chapter 5 of "The Software Vectorization Handbook", where the
+/// initial scalar value is correctly embedded in the vector reduction value,
+/// and a straightforward horizontal reduction will complete the operation.
+static Value genReductionInit(PatternRewriter &rewriter, Location loc,
+                              Reduction kind, VectorType vtp, Value r) {
+  switch (kind) {
+  case kSum:
+  case kXor: {
+    // Initialize reduction vector to: | 0 | .. | 0 | r |
+    Attribute zero = rewriter.getZeroAttr(vtp);
+    Value vec = rewriter.create<ConstantOp>(loc, vtp, zero);
+    return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
+  }
+  case kProduct: {
+    // Initialize reduction vector to: | 1 | .. | 1 | r |
+    Type etp = vtp.getElementType();
+    Attribute one;
+    if (etp.isa<FloatType>())
+      one = rewriter.getFloatAttr(etp, 1.0);
+    else
+      one = rewriter.getIntegerAttr(etp, 1);
+    Value vec =
+        rewriter.create<ConstantOp>(loc, vtp, DenseElementsAttr::get(vtp, one));
+    return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
+  }
+  case kAnd:
+  case kOr:
+    // Initialize reduction vector to: | r | .. | r | r |
+    return rewriter.create<vector::BroadcastOp>(loc, vtp, r);
+  }
+  llvm_unreachable("unknown reduction kind");
+}
+
 /// Maps sparse integer option to actual integral storage type.
 static Type genIntType(PatternRewriter &rewriter, unsigned width) {
   if (width == 0)
@@ -644,11 +733,15 @@
                                linalg::GenericOp op) {
   if (codegen.redVal)
     return codegen.redVal; // chained with previous for-loop
-  if (codegen.curVecLength > 1) {
-    // TODO: assumes + reductions for now
+  // Generate vector or scalar start of a reduction.
+  unsigned vl = codegen.curVecLength;
+  if (vl > 1) {
     VectorType vtp = vectorType(codegen, codegen.buffers[codegen.redExp]);
-    return rewriter.create<ConstantOp>(op.getLoc(), vtp,
-                                       rewriter.getZeroAttr(vtp));
+    assert(!merger.exp(codegen.redExp).val);
+    codegen.curVecLength = 1;
+    Value load = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
+    codegen.curVecLength = vl;
+    return genReductionInit(rewriter, op.getLoc(), codegen.redKind, vtp, load);
   }
   return genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
 }
@@ -661,19 +754,12 @@
     return;
   assert(codegen.curVecLength == 1);
   codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
+  // Generate vector or scalar end of a reduction.
   if (auto vtp = red.getType().dyn_cast<VectorType>()) {
-    // TODO: assumes + reductions for now
-    StringAttr kind = rewriter.getStringAttr("add");
-    Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
-    // Integer reductions don't accept an accumulator.
-    if (vtp.getElementType().isa<IntegerType>()) {
-      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
-                                                 kind, red, ValueRange{});
-      red = rewriter.create<AddIOp>(op.getLoc(), red, ld);
-    } else {
-      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
-                                                 kind, red, ld);
-    }
+    StringRef name = getReductionName(codegen.redKind);
+    StringAttr kind = rewriter.getStringAttr(name);
+    red = rewriter.create<vector::ReductionOp>(
+        op.getLoc(), vtp.getElementType(), kind, red, ValueRange{});
   }
   genTensorStore(merger, codegen, rewriter, op, red);
 }
@@ -725,7 +811,8 @@
 /// Hoists loop invariant tensor loads for which indices have been exhausted.
 static void genInvariants(Merger &merger, CodeGen &codegen,
                           PatternRewriter &rewriter, linalg::GenericOp op,
-                          unsigned exp, unsigned ldx, bool hoist) {
+                          unsigned exp, unsigned ldx, bool hoist,
+                          Kind last = Kind::kTensor) {
   if (exp == -1u)
     return;
   if (merger.exp(exp).kind == Kind::kTensor) {
@@ -743,6 +830,7 @@
     OpOperand *lhs = op.getOutputOperand(0);
     if (lhs == t) {
       codegen.redExp = hoist ? exp : -1u;
+      codegen.redKind = getReduction(last);
     } else if (atLevel) {
       merger.exp(exp).val =
           hoist ? genTensorLoad(merger, codegen, rewriter, op, exp) : Value();
@@ -751,10 +839,11 @@
     // Traverse into the binary operations. Note that we only hoist
     // tensor loads, since subsequent MLIR/LLVM passes know how to
     // deal with all other kinds of derived loop invariants.
+    Kind last = merger.exp(exp).kind;
     unsigned e0 = merger.exp(exp).children.e0;
     unsigned e1 = merger.exp(exp).children.e1;
-    genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist);
-    genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist);
+    genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist, last);
+    genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist, last);
   }
 }
 
@@ -1233,6 +1322,10 @@
   rewriter.replaceOp(op, result);
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse compiler rewriting methods.
+//===----------------------------------------------------------------------===//
+
 namespace {
 
 /// Sparse rewriting rule for generic Lingalg operation.
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
@@ -210,32 +210,38 @@
 //
 // CHECK-VEC1-LABEL: func @reduction_d
 // CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
+// CHECK-VEC1-DAG: %[[i0:.*]] = constant 0 : i32
 // CHECK-VEC1-DAG: %[[c16:.*]] = constant 16 : index
 // CHECK-VEC1-DAG: %[[c1024:.*]] = constant 1024 : index
 // CHECK-VEC1-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC1: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
+// CHECK-VEC1: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
+// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC1: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC1: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
 // CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK-VEC1: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
 // CHECK-VEC1: scf.yield %[[a]] : vector<16xf32>
 // CHECK-VEC1: }
-// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
 // CHECK-VEC1: return
 //
 // CHECK-VEC2-LABEL: func @reduction_d
 // CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
+// CHECK-VEC2-DAG: %[[i0:.*]] = constant 0 : i32
 // CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
 // CHECK-VEC2-DAG: %[[c1024:.*]] = constant 1024 : index
 // CHECK-VEC2-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC2: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
+// CHECK-VEC2: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
+// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC2: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC2: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
 // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK-VEC2: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
 // CHECK-VEC2: scf.yield %[[a]] : vector<16xf32>
 // CHECK-VEC2: }
-// CHECK-VEC2: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC2: %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
 // CHECK-VEC2: return
 //
 func @reduction_d(%arga: tensor<1024xf32, #DenseVector>, %argb: tensor<1024xf32>, %argx: tensor<f32>) -> tensor<f32> {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
@@ -0,0 +1,216 @@
+// RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
+// RUN:   --sparsification --sparse-tensor-conversion \
+// RUN:   --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// Do the same run, but now with SIMDization as well. This should not change the outcome.
+//
+// RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
+// RUN:   --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
+// RUN:   --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
+#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
+
+#trait_reduction = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> ()>    // x (scalar out)
+  ],
+  iterator_types = ["reduction"],
+  doc = "x += OPER_i a(i)"
+}
+
+// An example of vector reductions.
+module {
+
+  func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = addi %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
+                          %argx: tensor<f32>) -> tensor<f32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xf32, #SV>)
+      outs(%argx: tensor<f32>) {
+        ^bb(%a: f32, %x: f32):
+          %0 = addf %x, %a : f32
+          linalg.yield %0 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+  }
+
+  func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
+                           %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #DV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = muli %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
+                           %argx: tensor<f32>) -> tensor<f32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xf32, #DV>)
+      outs(%argx: tensor<f32>) {
+        ^bb(%a: f32, %x: f32):
+          %0 = mulf %x, %a : f32
+          linalg.yield %0 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+  }
+
+  func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #DV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = and %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
+                         %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = or %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = xor %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @dump_i32(%arg0 : tensor<i32>) {
+    %m = memref.buffer_cast %arg0 : memref<i32>
+    %v = memref.load %m[] : memref<i32>
+    vector.print %v : i32
+    return
+  }
+
+  func @dump_f32(%arg0 : tensor<f32>) {
+    %m = memref.buffer_cast %arg0 : memref<f32>
+    %v = memref.load %m[] : memref<f32>
+    vector.print %v : f32
+    return
+  }
+
+  func @entry() {
+    %ri = constant dense< 7 > : tensor<i32>
+    %rf = constant dense< 2.0 > : tensor<f32>
+
+    %c_0_i32 = constant dense<[
+        0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+        0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
+    ]> : tensor<32xi32>
+
+    %c_0_f32 = constant dense<[
+        0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
+        2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
+    ]> : tensor<32xf32>
+
+    %c_1_i32 = constant dense<[
+        1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
+    ]> : tensor<32xi32>
+
+    %c_1_f32 = constant dense<[
+        1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
+        1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+        1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
+    ]> : tensor<32xf32>
+
+    // Convert constants to annotated tensors.
+    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
+      : tensor<32xi32> to tensor<32xi32, #SV>
+    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
+      : tensor<32xf32> to tensor<32xf32, #SV>
+    %dense_input_i32 = sparse_tensor.convert %c_1_i32
+      : tensor<32xi32> to tensor<32xi32, #DV>
+    %dense_input_f32 = sparse_tensor.convert %c_1_f32
+      : tensor<32xf32> to tensor<32xf32, #DV>
+
+    // Call the kernels.
+    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
+       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
+    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
+       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
+       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
+    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
+       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+
+    // Verify results.
+    //
+    // CHECK: 26
+    // CHECK: 27.5
+    // CHECK: 3087
+    // CHECK: 168
+    // CHECK: 1
+    // CHECK: 15
+    // CHECK: 10
+    //
+    call @dump_i32(%0) : (tensor<i32>) -> ()
+    call @dump_f32(%1) : (tensor<f32>) -> ()
+    call @dump_i32(%2) : (tensor<i32>) -> ()
+    call @dump_f32(%3) : (tensor<f32>) -> ()
+    call @dump_i32(%4) : (tensor<i32>) -> ()
+    call @dump_i32(%5) : (tensor<i32>) -> ()
+    call @dump_i32(%6) : (tensor<i32>) -> ()
+
+    return
+  }
+}
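
Note for reviewers: the initialization scheme used by genReductionInit above can be checked independently of MLIR. The standalone C++ sketch below is hypothetical test code, not part of this patch; it only emulates the scheme for a product reduction with plain arrays in place of vector registers. The incoming scalar accumulator goes into one lane, the remaining lanes hold the multiplicative identity, and the final horizontal reduction then folds the accumulator in exactly once.

// Standalone sanity check (hypothetical, not part of the patch): emulates the
// reduction-vector initialization scheme of genReductionInit for kProduct.
#include <array>
#include <cstdint>
#include <iostream>

constexpr std::size_t VL = 8; // simulated vector length

// Horizontal product reduction over the simulated vector register.
static int64_t horizontalMul(const std::array<int64_t, VL> &v) {
  int64_t acc = 1;
  for (int64_t e : v)
    acc *= e;
  return acc;
}

int main() {
  const int64_t r = 7; // incoming scalar accumulator (like %ri in the test)
  // Initialize reduction vector to: | 1 | .. | 1 | r |
  std::array<int64_t, VL> red;
  red.fill(1);
  red[0] = r;
  // Simulated vectorized loop body: elementwise multiply of one data chunk.
  const std::array<int64_t, VL> chunk = {2, 3, 1, 1, 5, 1, 1, 1};
  for (std::size_t i = 0; i < VL; ++i)
    red[i] *= chunk[i];
  // Horizontal reduction folds r in exactly once: 7 * 2 * 3 * 5 == 210.
  std::cout << horizontalMul(red) << " == " << r * 2 * 3 * 5 << "\n";
  return 0;
}

For kSum and kXor the identity lanes would hold 0 instead of 1, and for kAnd and kOr the whole vector can simply be filled with r, since x & x == x and x | x == x make the duplicate occurrences harmless.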