diff --git a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
@@ -171,11 +172,14 @@
     outputExpr.push_back(
         b.getAffineDimExpr(dim < insertSplitDimension ? dim : dim + 1));
   }
-  Value initTensor = b.create<linalg::InitTensorOp>(
-      loc, newOutputShape, op.getRegionOutputArgs()[0].getType());
+  Value allocTensor = b.create<bufferization::AllocTensorOp>(
+      loc,
+      RankedTensorType::get(newOutputShape,
+                            op.getRegionOutputArgs()[0].getType()),
+      ValueRange{});
   Value constantOp = b.create<arith::ConstantOp>(loc, identity);
   Value identityTensor =
-      b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor)
+      b.create<linalg::FillOp>(op->getLoc(), constantOp, allocTensor)
           .getResult(0);
   newMaps.push_back(AffineMap::get(oldOutputMap.getNumDims() + 1, 0, outputExpr,
@@ -189,7 +193,7 @@
   // Create the new op matching the original op with an extra parallel
   // dimension.
   GenericOp genericOp = b.create<GenericOp>(
-      loc, TypeRange({initTensor.getType()}), newInputs,
+      loc, TypeRange({allocTensor.getType()}), newInputs,
       ValueRange({identityTensor}), newMaps, newIteratorTypes);
   b.inlineRegionBefore(op->getRegion(0), genericOp.region(),
                        genericOp.region().begin());
@@ -297,7 +301,7 @@
     return b.notifyMatchFailure(op, "unknown reduction neutral");
   // TODO: relax this when multi-reduction support is available.
-  if (op.getNumOutputs() != neutralElements.size())
+  if (op.getNumOutputs() != (int)neutralElements.size())
     return b.notifyMatchFailure(op, "expect one reduction per output");
   // Rewrite part.
@@ -327,8 +331,7 @@
         reductionDimSize / splitFactor, insertSplitDimension);
     SmallVector<Value> dims =
         tensor::createDynamicDimValues(b, loc, rankedTensor);
-    Value initTensor = b.create<linalg::InitTensorOp>(
-        loc, dims, newT.getShape(), t.getElementType());
+    Value initTensor = b.create<bufferization::AllocTensorOp>(loc, newT, dims);
     Value constantOp = b.create<arith::ConstantOp>(loc, std::get<1>(it));
     fillOps.push_back(
         b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor));
diff --git a/mlir/test/Dialect/Linalg/split_reduction.mlir b/mlir/test/Dialect/Linalg/split_reduction.mlir
--- a/mlir/test/Dialect/Linalg/split_reduction.mlir
+++ b/mlir/test/Dialect/Linalg/split_reduction.mlir
@@ -15,7 +15,7 @@
 // CHECK-DAG: %[[ID:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x4x64xf32>
 // CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1], [2]] : tensor<256x32xf32> into tensor<4x64x32xf32>
-// CHECK-DAG: %[[INI:.*]] = linalg.init_tensor [16, 32, 4] : tensor<16x32x4xf32>
+// CHECK-DAG: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<16x32x4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
 // CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
 // CHECK-SAME: , iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
@@ -57,7 +57,7 @@
 //CHECK-LABEL: @generic_split_1d
 // CHECK: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
-// CHECK: %[[INI:.*]] = linalg.init_tensor [4] : tensor<4xf32>
+// CHECK: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
 // CHECK: %[[G:.*]] = linalg.generic
 // CHECK: {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]],
@@ -103,7 +103,7 @@
 // CHECK: %[[ID:.*]] = arith.constant -3.40282347E+38 : f32
 // CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32>
 // CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32>
-// CHECK: %[[INI:.*]] = linalg.init_tensor [5, 2, 4] : tensor<5x2x4xf32>
+// CHECK: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<5x2x4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 // CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
 // CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<4x8x2xf32>, tensor<5x4x8xf32>) outs(%[[F]] : tensor<5x2x4xf32>) {
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
@@ -3,6 +3,7 @@
 // CHECK-LABEL: func.func @matmul_split
 func.func @matmul_split(%A : tensor<?x256xf32>, %B: tensor<256x32xf32>, %C: tensor<?x32xf32>) -> tensor<?x32xf32> {
+  // CHECK: bufferization.alloc_tensor({{.*}}) : tensor<?x32x4xf32>
   // CHECK: linalg.generic
   // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
   // CHECK-SAME: ins(%{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}} : tensor<?x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -41,6 +42,7 @@
   void getDependentDialects(DialectRegistry &registry) const override {
     // clang-format off
     registry.insert
/tmp/aaa.s
+
+llvm.func @matrix_intrinsics(%A: vector<64 x f64>, %B: vector<64 x f64>)
+                             -> vector<64 x f64> {
+  %C = llvm.intr.matrix.multiply %A, %B
+    { lhs_rows = 8: i32, lhs_columns = 8: i32 , rhs_columns = 8: i32} :
+    (vector<64 x f64>, vector<64 x f64>) -> vector<64 x f64>
+  llvm.return %C: vector<64 x f64>
+}
diff --git a/ntv-trunk/PASC-2022-examples/scf-dialect.mlir b/ntv-trunk/PASC-2022-examples/scf-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/scf-dialect.mlir
@@ -0,0 +1,16 @@
+func.func @conditional_reduce(%buffer: memref<1024xf32>, %lb: index, %ub: index, %step: index) -> (f32) {
+  %sum_0 = arith.constant 0.0 : f32
+  %c0 = arith.constant 0.0 : f32
+  %sum = scf.for %iv = %lb to %ub step %step iter_args(%sum_iter = %sum_0) -> (f32) {
+    %t = memref.load %buffer[%iv] : memref<1024xf32>
+    %cond = arith.cmpf ugt, %t, %c0 : f32
+    %sum_next = scf.if %cond -> (f32) {
+      %new_sum = arith.addf %sum_iter, %t : f32
+      scf.yield %new_sum : f32
+    } else {
+      scf.yield %sum_iter : f32
+    }
+    scf.yield %sum_next : f32
+  }
+  return %sum : f32
+}
diff --git a/ntv-trunk/PASC-2022-examples/transform-dialect.mlir b/ntv-trunk/PASC-2022-examples/transform-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/transform-dialect.mlir
@@ -0,0 +1,55 @@
+// RUN: iree-opt %s
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+  pdl.pattern @pdl_generic_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "linalg.generic"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_if_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "scf.if"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_foreach_thread_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "scf.foreach_thread"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_warp_execute_on_lane_0_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "vector.warp_execute_on_lane_0"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+
+  transform.structured.canonicalized_sequence %arg0 {
+  ^bb1(%arg1: !pdl.operation):
+    %0 = pdl_match @pdl_generic_target in %arg1
+    %fill_op, %more_parallel_op, %combiner_op = transform.structured.split_reduction %0
+      { split_factor = 2, insert_split_dimension = 1}
+
+    %foreach_thread_2, %tiled_more_parallel_op =
+      tile_to_foreach_thread_op %more_parallel_op {num_threads = [4, 2], thread_dim_mapping = [2, 1, 0]}
+
+    %isolated = transform.get_closest_isolated_parent %foreach_thread_2
+    transform.structured.vectorize %isolated
+
+    transform.iree.bufferize
+    %some_foreach_thread = pdl_match @pdl_foreach_thread_op_target in %arg1
+    %isolated_handle_2 = transform.get_closest_isolated_parent %some_foreach_thread
+    transform.iree.foreach_thread_to_gpu_and_translation_info %isolated_handle_2 { workgroup_size = [32, 2, 4] }
+
+    %warp = pdl_match @pdl_warp_execute_on_lane_0_op_target in %arg1
+    %isolated_2 = transform.get_closest_isolated_parent %warp
+    transform.iree.vector.warp_distribute %isolated_2
+  }
+}
diff --git a/ntv-trunk/PASC-2022-examples/vector-dialect.mlir b/ntv-trunk/PASC-2022-examples/vector-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/vector-dialect.mlir
@@ -0,0 +1,11 @@
+// mlir-opt -test-vector-multi-reduction-lowering-patterns="use-outer-reductions" ntv-trunk/PASC-2022-examples/vector-dialect.mlir
+//
+// mlir-opt -test-vector-multi-reduction-lowering-patterns="use-outer-reductions" ntv-trunk/PASC-2022-examples/vector-dialect.mlir | \
+// mlir-opt -convert-vector-to-llvm -convert-func-to-llvm -convert-cf-to-llvm -canonicalize -cse | \
+// mlir-translate --mlir-to-llvmir | \
+// opt -O3 -enable-matrix -matrix-allow-contract -mtriple=x86_64-grtev4-linux-gnu -mcpu=skylake-avx512 | \
+// llc -O3 -mtriple=x86_64-grtev4-linux-gnu -mcpu=skylake-avx512
+func.func @vector_reduction_outer(%arg0: vector<2x3x4x5xi32>) -> vector<2x3xi32> {
+  %0 = vector.multi_reduction <add>, %arg0 [2, 3] : vector<2x3x4x5xi32> to vector<2x3xi32>
+  return %0 : vector<2x3xi32>
+}
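
For readers who have not run the pass, here is a hand-written sketch of the kind of IR the split-reduction rewrite (exercised by split_reduction.mlir above) produces for a 1-D multiplicative reduction: the 32-element reduction dimension is reshaped into 4x8, a partial reduction over the inner dimension runs with the outer dimension kept parallel, and a final generic folds the 4 partial results into the original output. The function name, map names, and exact op bodies below are illustrative only, not verbatim test output; the relevant point for this patch is that the intermediate is created with bufferization.alloc_tensor rather than linalg.init_tensor.

#map_in  = affine_map<(d0, d1) -> (d0, d1)>
#map_out = affine_map<(d0, d1) -> (d0)>
#map_par = affine_map<(d0) -> (d0)>
#map_red = affine_map<(d0) -> ()>

func.func @split_mul_reduction_sketch(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
  // Neutral element of multiplication, used to seed the partial results.
  %id = arith.constant 1.000000e+00 : f32
  // 32 = 4 * 8: expose the split factor as an extra dimension.
  %exp = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
  // The intermediate now comes from bufferization.alloc_tensor (was linalg.init_tensor).
  %alloc = bufferization.alloc_tensor() : tensor<4xf32>
  %fill = linalg.fill ins(%id : f32) outs(%alloc : tensor<4xf32>) -> tensor<4xf32>
  // Partial reduction: reduce the inner 8 elements, keep the outer 4 parallel.
  %partial = linalg.generic
      {indexing_maps = [#map_in, #map_out],
       iterator_types = ["parallel", "reduction"]}
      ins(%exp : tensor<4x8xf32>) outs(%fill : tensor<4xf32>) {
  ^bb0(%a: f32, %acc: f32):
    %m = arith.mulf %a, %acc : f32
    linalg.yield %m : f32
  } -> tensor<4xf32>
  // Final reduction of the 4 partial products into the original 0-d output.
  %res = linalg.generic
      {indexing_maps = [#map_par, #map_red],
       iterator_types = ["reduction"]}
      ins(%partial : tensor<4xf32>) outs(%out : tensor<f32>) {
  ^bb0(%p: f32, %acc: f32):
    %m = arith.mulf %p, %acc : f32
    linalg.yield %m : f32
  } -> tensor<f32>
  return %res : tensor<f32>
}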