diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h @@ -12,6 +12,7 @@ #include "mlir/Dialect/PDL/IR/PDLTypes.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/RegionKindInterface.h" namespace mlir { class TilingInterface; diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -17,6 +17,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/EnumAttr.td" include "mlir/IR/OpBase.td" +include "mlir/IR/RegionKindInterface.td" def DecomposeOp : Op, + DeclareOpInterfaceMethods] # GraphRegionNoTerminator.traits> { + let description = [{ + Replace all `target` payload ops with the single op that is contained in + this op's region. All targets must have zero operands and must be isolated + from above. + + This op is for debugging/experiments only. + + #### Return modes + + This operation consumes the `target` handle. 
+ }]; + + let arguments = (ins PDL_Operation:$target); + let results = (outs PDL_Operation:$replacement); + let regions = (region SizedRegion<1>:$bodyRegion); + let assemblyFormat = "$target attr-dict-with-keyword regions"; + let hasVerifier = 1; +} + def ScalarizeOp : Op { diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/StringSet.h" @@ -883,6 +884,76 @@ return DiagnosedSilenceableFailure(success()); } +//===----------------------------------------------------------------------===// +// ReplaceOp +//===----------------------------------------------------------------------===// + +/// Replaces every payload op mapped to the `target` handle with a clone of the +/// single op in this op's body region and maps `replacement` to the clones. +/// Payload ops nested under this transform op are left untouched. Emits a +/// definite failure if a target has operands, or has regions without being +/// isolated from above. +DiagnosedSilenceableFailure +transform::ReplaceOp::apply(TransformResults &transformResults, + TransformState &state) { + ArrayRef<Operation *> payload = state.getPayloadOps(getTarget()); + + // Check for invalid targets. + for (Operation *target : payload) { + if (target->getNumOperands() > 0) + return emitDefiniteFailure() << "expected target without operands"; + if (!target->hasTrait<OpTrait::IsIsolatedFromAbove>() && + target->getNumRegions() > 0) + return emitDefiniteFailure() + << "expected target that is isolated from above"; + } + + // Clone and replace. 
+ IRRewriter rewriter(getContext()); + Operation *pattern = &getBodyRegion().front().front(); + SmallVector<Operation *> replacements; + for (Operation *target : payload) { + // Leave ops nested under this transform op (e.g. the pattern) untouched. + if (getOperation()->isAncestor(target)) + continue; + rewriter.setInsertionPoint(target); + Operation *replacement = rewriter.clone(*pattern); + rewriter.replaceOp(target, replacement->getResults()); + replacements.push_back(replacement); + } + transformResults.set(getReplacement().cast<OpResult>(), replacements); + return DiagnosedSilenceableFailure(success()); +} + +/// Declares that `target` is consumed, `replacement` is produced and the +/// payload IR is modified. +void transform::ReplaceOp::getEffects( + SmallVectorImpl<MemoryEffects::EffectInstance> &effects) { + consumesHandle(getTarget(), effects); + producesHandle(getReplacement(), effects); + modifiesPayload(effects); +} + +/// Checks that the body region holds exactly one block with exactly one op, +/// and that this op has no operands and, if it has regions, is isolated from +/// above. +LogicalResult transform::ReplaceOp::verify() { + if (!getBodyRegion().hasOneBlock()) + return emitOpError() << "expected one block"; + if (std::distance(getBodyRegion().front().begin(), + getBodyRegion().front().end()) != 1) + return emitOpError() << "expected one operation in block"; + Operation *replacement = &getBodyRegion().front().front(); + if (replacement->getNumOperands() > 0) + return replacement->emitOpError() + << "expected replacement without operands"; + if (!replacement->hasTrait<OpTrait::IsIsolatedFromAbove>() && + replacement->getNumRegions() > 0) + return replacement->emitOpError() + << "expected op that is isolated from above"; + return success(); +} + //===----------------------------------------------------------------------===// // ScalarizeOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/transform-op-replace.mlir b/mlir/test/Dialect/Linalg/transform-op-replace.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/transform-op-replace.mlir @@ -0,0 +1,50 @@ +// RUN: mlir-opt -test-transform-dialect-interpreter %s -allow-unregistered-dialect -verify-diagnostics --split-input-file | FileCheck %s + +// CHECK: func.func @foo() { +// CHECK: "dummy_op"() : () -> () +// CHECK: } +// CHECK-NOT: 
func.func @bar +func.func @bar() { + "another_op"() : () -> () +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !pdl.operation): + %0 = transform.structured.match ops{["func.func"]} in %arg1 + transform.structured.replace %0 { + func.func @foo() { + "dummy_op"() : () -> () + } + } +} + +// ----- + +func.func @bar(%arg0: i1) { + "another_op"(%arg0) : (i1) -> () +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !pdl.operation): + %0 = transform.structured.match ops{["another_op"]} in %arg1 + // expected-error @+1 {{expected target without operands}} + transform.structured.replace %0 { + "dummy_op"() : () -> () + } +} + +// ----- + +func.func @bar() { + "another_op"() : () -> () +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !pdl.operation): + %0 = transform.structured.match ops{["another_op"]} in %arg1 + transform.structured.replace %0 { + ^bb0(%a: i1): + // expected-error @+1 {{expected replacement without operands}} + "dummy_op"(%a) : (i1) -> () + } +} diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -1,855 +1,12 @@ -// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if | FileCheck %s --check-prefix=CHECK-SCF-IF -// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform" | FileCheck --check-prefixes=CHECK-HOIST %s -// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" | FileCheck --check-prefixes=CHECK-D %s -// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=propagate-distribution -canonicalize | FileCheck --check-prefixes=CHECK-PROP %s -// RUN: mlir-opt %s -allow-unregistered-dialect 
-split-input-file -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize | FileCheck --check-prefixes=CHECK-DIST-AND-PROP %s +// RUN: mlir-opt %s -allow-unregistered-dialect -// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)> -// CHECK-SCF-IF-DAG: #[[$TIMES4:.*]] = affine_map<()[s0] -> (s0 * 4)> -// CHECK-SCF-IF-DAG: #[[$TIMES8:.*]] = affine_map<()[s0] -> (s0 * 8)> -// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3> -// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3> -// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3> -// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3> - -// CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if( -// CHECK-SCF-IF-SAME: %[[laneid:.*]]: index, -// CHECK-SCF-IF-SAME: %[[v0:.*]]: vector<4xf32>, %[[v1:.*]]: vector<8xf32>) -func.func @rewrite_warp_op_to_scf_if(%laneid: index, - %v0: vector<4xf32>, %v1: vector<8xf32>) { -// CHECK-SCF-IF-DAG: %[[c0:.*]] = arith.constant 0 : index -// CHECK-SCF-IF: %[[is_lane_0:.*]] = arith.cmpi eq, %[[laneid]], %[[c0]] - -// CHECK-SCF-IF: %[[buffer_v0:.*]] = memref.get_global @__shared_128xf32 -// CHECK-SCF-IF: %[[s0:.*]] = affine.apply #[[$TIMES4]]()[%[[laneid]]] -// CHECK-SCF-IF: vector.transfer_write %[[v0]], %[[buffer_v0]][%[[s0]]] -// CHECK-SCF-IF: %[[buffer_v1:.*]] = memref.get_global @__shared_256xf32 -// CHECK-SCF-IF: %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]] -// CHECK-SCF-IF: vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]] - -// CHECK-SCF-IF-DAG: gpu.barrier -// CHECK-SCF-IF-DAG: %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32 -// CHECK-SCF-IF-DAG: %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32 - -// CHECK-SCF-IF: scf.if %[[is_lane_0]] { - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, 
vector<2xf32>) { - ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>): -// CHECK-SCF-IF: %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32> -// CHECK-SCF-IF: %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, 3>, vector<128xf32> -// CHECK-SCF-IF: %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32> -// CHECK-SCF-IF: %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32> - %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32> - %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32> -// CHECK-SCF-IF: vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]] -// CHECK-SCF-IF: vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]] - vector.yield %2, %3 : vector<32xf32>, vector<64xf32> - } -// CHECK-SCF-IF: } -// CHECK-SCF-IF: gpu.barrier -// CHECK-SCF-IF: %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]] -// CHECK-SCF-IF: %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32> -// CHECK-SCF-IF: %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32> -// CHECK-SCF-IF: "some_use"(%[[r0]]) : (vector<1xf32>) -> () -// CHECK-SCF-IF: "some_use"(%[[r1]]) : (vector<2xf32>) -> () - "some_use"(%r#0) : (vector<1xf32>) -> () - "some_use"(%r#1) : (vector<2xf32>) -> () - return -} - -// ----- - -// CHECK-D-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 2 + 32)> - -// CHECK-DIST-AND-PROP-LABEL: func @warp( -// CHECK-HOIST: memref.subview -// CHECK-HOIST: memref.subview -// CHECK-HOIST: memref.subview -// CHECK-HOIST: vector.warp_execute_on_lane_0 - -// CHECK-D: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) { -// CHECK-D: arith.addf {{.*}} : vector<32xf32> -// CHECK-D: arith.addf {{.*}} : 
vector<64xf32> -// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32> -// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32 -// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}] -// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32 - -// CHECK-DIST-AND-PROP-NOT: vector.warp_execute_on_lane_0 -// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> -// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> -// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32> -// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32> -// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<1xf32> -// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<2xf32> -// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<1xf32> -// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<2xf32> - -func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>, - %arg3: memref<1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { - %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> - %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> - %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %cst = arith.constant 0.000000e+00 : f32 - %2 = vector.transfer_read %sa[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32> - %3 = vector.transfer_read %sa[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32> - %4 = vector.transfer_read %sb[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32> - %5 = vector.transfer_read %sb[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, 
vector<64xf32> - %6 = arith.addf %2, %3 : vector<32xf32> - %7 = arith.addf %4, %5 : vector<64xf32> - vector.transfer_write %6, %sc[%c0] : vector<32xf32>, memref<128xf32, strided<[1], offset: ?>> - vector.transfer_write %7, %sc[%c32] : vector<64xf32>, memref<128xf32, strided<[1], offset: ?>> - } - return -} - -// ----- - -// CHECK-D-LABEL: func @warp_extract( -// CHECK-D: %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>) -// CHECK-D: "test.dummy_op" -// CHECK-D: "test.dummy_op" -// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32> -// CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { -// CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32> -// CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { -// CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32> -// CHECK-D: } - -func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { - %c0 = arith.constant 0 : index - %v = "test.dummy_op"() : () -> (vector<1xf32>) - %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>) - vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32> - vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32> - } - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_dead_result( -func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) { - // CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) - %r:3 = vector.warp_execute_on_lane_0(%laneid)[32] -> - (vector<1xf32>, vector<1xf32>, vector<1xf32>) { - %2 = "some_def"() : () -> (vector<32xf32>) - %3 = "some_def"() : () -> (vector<32xf32>) - %4 = "some_def"() : () -> (vector<32xf32>) - // CHECK-PROP: vector.yield %{{.*}} : vector<32xf32> - vector.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32> - } - // 
CHECK-PROP: return %[[R]] : vector<1xf32> - return %r#1 : vector<1xf32> -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_propagate_operand( -// CHECK-PROP-SAME: %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>) -func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>) - -> (vector<4xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0 : vector<4xf32>) -> (vector<4xf32>) { - ^bb0(%arg0 : vector<128xf32>) : - vector.yield %arg0 : vector<128xf32> - } - // CHECK-PROP: return %[[V]] : vector<4xf32> - return %r : vector<4xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (s0 * 2)> - -// CHECK-PROP-LABEL: func @warp_propagate_elementwise( -func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) { - %c0 = arith.constant 0 : index - %c32 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - // CHECK-PROP: %[[R:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>) - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> - (vector<1xf32>, vector<2xf32>) { - // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32> - // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32> - // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32> - // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32> - // CHECK-PROP: vector.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32> - %2 = "some_def"() : () -> (vector<32xf32>) - %3 = "some_def"() : () -> (vector<32xf32>) - %4 = "some_def"() : () -> (vector<64xf32>) - %5 = "some_def"() : () -> (vector<64xf32>) - %6 = arith.addf %2, %3 : vector<32xf32> - %7 = arith.addf %4, %5 : vector<64xf32> - vector.yield %6, %7 : vector<32xf32>, vector<64xf32> - } - // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32> - // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32> - %id2 = affine.apply #map0()[%laneid] - // 
CHECK-PROP: vector.transfer_write %[[A1]], {{.*}} : vector<1xf32>, memref<1024xf32> - // CHECK-PROP: vector.transfer_write %[[A0]], {{.*}} : vector<2xf32>, memref<1024xf32> - vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32> - vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32> - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_propagate_scalar_arith( -// CHECK-PROP: %[[r:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} { -// CHECK-PROP: %[[some_def0:.*]] = "some_def" -// CHECK-PROP: %[[some_def1:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def0]], %[[some_def1]] -// CHECK-PROP: } -// CHECK-PROP: arith.addf %[[r]]#0, %[[r]]#1 : f32 -func.func @warp_propagate_scalar_arith(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (f32) - %1 = "some_def"() : () -> (f32) - %2 = arith.addf %0, %1 : f32 - vector.yield %2 : f32 - } - vector.print %r : f32 - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_propagate_cast( -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 -// CHECK-PROP: %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32 -// CHECK-PROP: return %[[result]] -func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %casted = arith.sitofp %i : i32 to f32 - vector.yield %casted : f32 - } - return %r : f32 -} - -// ----- - -#map0 = affine_map<()[s0] -> (s0 * 2)> - -// CHECK-PROP-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 2)> - -// CHECK-PROP: func @warp_propagate_read -// CHECK-PROP-SAME: (%[[ID:.*]]: index -func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: memref<1024xf32>) { -// CHECK-PROP-NOT: warp_execute_on_lane_0 -// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[ID]]], %{{.*}} : memref<1024xf32>, vector<1xf32> -// CHECK-PROP-DAG: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]] -// CHECK-PROP-DAG: %[[R1:.*]] = 
vector.transfer_read %arg1[%[[ID2]]], %{{.*}} : memref<1024xf32>, vector<2xf32> -// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1xf32>, memref<1024xf32> -// CHECK-PROP: vector.transfer_write %[[R1]], {{.*}} : vector<2xf32>, memref<1024xf32> - %c0 = arith.constant 0 : index - %c32 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) { - %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32> - %3 = vector.transfer_read %src[%c32], %cst : memref<1024xf32>, vector<64xf32> - vector.yield %2, %3 : vector<32xf32>, vector<64xf32> - } - %id2 = affine.apply #map0()[%laneid] - vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32> - vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32> - return -} - -// ----- - -// CHECK-PROP-LABEL: func @fold_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) -// CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32> -// CHECK-PROP: vector.print %[[r]] : vector<1xf32> -func.func @fold_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { - %0 = "some_def"() : () -> (vector<1xf32>) - %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32> - vector.yield %1 : vector<32xf32> - } - vector.print %r : vector<1xf32> - return -} - -// ----- - -// CHECK-PROP-LABEL: func @extract_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) -// CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32> -// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32> -// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> -func.func @extract_vector_broadcast(%laneid: index) { - %r = 
vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { - %0 = "some_def"() : () -> (vector<1xf32>) - %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32> - vector.yield %1 : vector<64xf32> - } - vector.print %r : vector<2xf32> - return -} - -// ----- - -// CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (f32) -// CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : f32 -// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32> -// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> -func.func @extract_scalar_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { - %0 = "some_def"() : () -> (f32) - %1 = vector.broadcast %0 : f32 to vector<64xf32> - vector.yield %1 : vector<64xf32> - } - vector.print %r : vector<2xf32> - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_scf_for( -// CHECK-PROP: %[[INI:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) { -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]] : vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) { -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) { -// CHECK-PROP: ^bb0(%[[ARG:.*]]: vector<128xf32>): -// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[ARG]]) : (vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: scf.yield %[[W]] : vector<4xf32> -// CHECK-PROP: } -// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> () -func.func @warp_scf_for(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = 
vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { - %ini = "some_def"() : () -> (vector<128xf32>) - %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { - %acc = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>) - scf.yield %acc : vector<128xf32> - } - vector.yield %3 : vector<128xf32> - } - "some_use"(%0) : (vector<4xf32>) -> () - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_scf_for_use_from_above( -// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) { -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) { -// CHECK-PROP: ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>): -// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: scf.yield %[[W]] : vector<4xf32> -// CHECK-PROP: } -// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> () -func.func @warp_scf_for_use_from_above(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { - %ini = "some_def"() : () -> (vector<128xf32>) - %use_from_above = "some_def_above"() : () -> (vector<128xf32>) - %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { - %acc = "some_def"(%arg4, %use_from_above) : 
(vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) - scf.yield %acc : vector<128xf32> - } - vector.yield %3 : vector<128xf32> - } - "some_use"(%0) : (vector<4xf32>) -> () - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_scf_for_swap( -// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: %[[INI2:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>): -// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32> -// CHECK-PROP: } -// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () -// CHECK-PROP: "some_use"(%[[F]]#1) : (vector<4xf32>) -> () -func.func @warp_scf_for_swap(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0:2 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) { - %ini1 = "some_def"() : () -> (vector<128xf32>) - %ini2 = "some_def"() : () -> (vector<128xf32>) - %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) { - 
%acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>) - %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>) - scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32> - } - vector.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32> - } - "some_use"(%0#0) : (vector<4xf32>) -> () - "some_use"(%0#1) : (vector<4xf32>) -> () - return -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_scf_for_swap_no_yield( -// CHECK-PROP: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-PROP-NEXT: vector.warp_execute_on_lane_0(%{{.*}})[32] { -// CHECK-PROP-NEXT: "some_op"() : () -> () -// CHECK-PROP-NEXT: } -// CHECK-PROP-NEXT: } -func.func @warp_scf_for_swap_no_yield(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - vector.warp_execute_on_lane_0(%arg0)[32] { - scf.for %arg3 = %c0 to %c128 step %c1 { - "some_op"() : () -> () +func.func @foo() { + "test.foo"() ({ + builtin.module { + func.func private @test() } - } - return -} - -// ----- - -#map = affine_map<()[s0] -> (s0 * 4)> -#map1 = affine_map<()[s0] -> (s0 * 128 + 128)> -#map2 = affine_map<()[s0] -> (s0 * 4 + 128)> - -// CHECK-PROP-LABEL: func @warp_scf_for_multiple_yield( -// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { -// CHECK-PROP-NEXT: "some_def"() : () -> vector<32xf32> -// CHECK-PROP-NEXT: vector.yield %{{.*}} : vector<32xf32> -// CHECK-PROP-NEXT: } -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 -// CHECK-PROP: vector.transfer_read {{.*}} : memref, vector<4xf32> -// CHECK-PROP: vector.transfer_read {{.*}} : memref, vector<4xf32> -// CHECK-PROP: %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 -// CHECK-PROP: vector.transfer_read {{.*}} : memref, vector<4xf32> -// CHECK-PROP: vector.transfer_read {{.*}} : memref, vector<4xf32> -// CHECK-PROP: arith.addf {{.*}} : vector<4xf32> -// CHECK-PROP: arith.addf {{.*}} : 
vector<4xf32> -// CHECK-PROP: scf.yield {{.*}} : vector<4xf32>, vector<4xf32> -// CHECK-PROP: } -func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2: memref) { - %c256 = arith.constant 256 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0:3 = vector.warp_execute_on_lane_0(%arg0)[32] -> - (vector<1xf32>, vector<4xf32>, vector<4xf32>) { - %def = "some_def"() : () -> (vector<32xf32>) - %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref, vector<128xf32> - %r2 = vector.transfer_read %arg2[%c128], %cst {in_bounds = [true]} : memref, vector<128xf32> - %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %r1, %arg5 = %r2) - -> (vector<128xf32>, vector<128xf32>) { - %o1 = affine.apply #map1()[%arg3] - %o2 = affine.apply #map2()[%arg3] - %4 = vector.transfer_read %arg1[%o1], %cst {in_bounds = [true]} : memref, vector<128xf32> - %5 = vector.transfer_read %arg1[%o2], %cst {in_bounds = [true]} : memref, vector<128xf32> - %6 = arith.addf %4, %arg4 : vector<128xf32> - %7 = arith.addf %5, %arg5 : vector<128xf32> - scf.yield %6, %7 : vector<128xf32>, vector<128xf32> - } - vector.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32> - } - %1 = affine.apply #map()[%arg0] - vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref - %2 = affine.apply #map2()[%arg0] - vector.transfer_write %0#2, %arg2[%2] {in_bounds = [true]} : vector<4xf32>, memref - "some_use"(%0#0) : (vector<1xf32>) -> () - return -} - -// ----- - -// CHECK-PROP-LABEL: func @vector_reduction( -// CHECK-PROP-SAME: %[[laneid:.*]]: index) -// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32 -// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32 -// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32 -// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 -// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : 
i32 -// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) { -// CHECK-PROP: vector.yield %{{.*}} : vector<32xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[a:.*]] = vector.extract %[[warp_op]][0] : vector<1xf32> -// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] -// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]] -// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]] -// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]] -// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]] -// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]] -// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]] -// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]] -// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]] -// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] -// CHECK-PROP: return %[[a4]] : f32 -func.func @vector_reduction(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector<32xf32>) - %1 = vector.reduction , %0 : vector<32xf32> into f32 - vector.yield %1 : f32 - } - return %r : f32 -} - -// ----- - -func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref) { - %c0 = arith.constant 0: index - %f0 = arith.constant 0.0: f32 - // CHECK-D: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector) { - // CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { - // CHECK-D: vector.transfer_write %[[R]], %{{.*}}[] : vector, memref - vector.warp_execute_on_lane_0(%laneid)[32] { - %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32> - %1 = vector.transfer_read %m1[], %f0 : memref, vector - %2 = vector.extractelement %1[] : vector - %3 = vector.reduction , %0 : vector<32xf32> into f32 - %4 = 
arith.addf %3, %2 : f32 - %5 = vector.broadcast %4 : f32 to vector - vector.transfer_write %5, %m1[] : vector, memref - } - return -} - -// ----- - -// CHECK-PROP-LABEL: func @vector_reduction_large( -// CHECK-PROP-SAME: %[[laneid:.*]]: index) -// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32 -// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32 -// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32 -// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 -// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 -// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) { -// CHECK-PROP: vector.yield %{{.*}} : vector<64xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[a:.*]] = vector.reduction , %[[warp_op]] : vector<2xf32> into f32 -// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] -// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]] -// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]] -// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]] -// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]] -// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]] -// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]] -// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]] -// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]] -// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] -// CHECK-PROP: return %[[a4]] : f32 -func.func @vector_reduction_large(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector<64xf32>) - %1 = vector.reduction , %0 : vector<64xf32> into f32 - vector.yield %1 : f32 - } - return %r : f32 -} - -// ----- - -// CHECK-PROP-LABEL: func @vector_reduction_acc( -// CHECK-PROP-SAME: %[[laneid:.*]]: index) -// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 
: i32 -// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32 -// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32 -// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 -// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 -// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]]:2 = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) { -// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32 -// CHECK-PROP: } -// CHECK-PROP: %[[a:.*]] = vector.reduction , %[[warp_op]]#0 : vector<2xf32> into f32 -// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] -// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]] -// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]] -// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]] -// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]] -// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]] -// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]] -// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]] -// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]] -// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] -// CHECK-PROP: %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1 -// CHECK-PROP: return %[[a5]] : f32 -func.func @vector_reduction_acc(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector<64xf32>) - %1 = "some_def"() : () -> (f32) - %2 = vector.reduction , %0, %1 : vector<64xf32> into f32 - vector.yield %2 : f32 - } - return %r : f32 -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_duplicate_yield( -func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) { - // CHECK-PROP: %{{.*}}:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>) - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, 
vector<1xf32>) { - %2 = "some_def"() : () -> (vector<32xf32>) - %3 = "some_def"() : () -> (vector<32xf32>) - %4 = arith.addf %2, %3 : vector<32xf32> - %5 = arith.addf %2, %2 : vector<32xf32> -// CHECK-PROP-NOT: arith.addf -// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32> - vector.yield %4, %5 : vector<32xf32>, vector<32xf32> - } - return %r#0, %r#1 : vector<1xf32>, vector<1xf32> -} - -// ----- - -// CHECK-PROP-LABEL: func @warp_constant( -// CHECK-PROP: %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32> -// CHECK-PROP: return %[[C]] : vector<1xf32> -func.func @warp_constant(%laneid: index) -> (vector<1xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { - %cst = arith.constant dense<2.0> : vector<32xf32> - vector.yield %cst : vector<32xf32> - } - return %r : vector<1xf32> -} - -// ----- - -// CHECK-PROP-LABEL: func.func @vector_extract_simple( -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { -// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<1xf32> -// CHECK-PROP: vector.yield %[[V]] : vector<1xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][0] : vector<1xf32> -// CHECK-PROP: return %[[E]] : f32 -func.func @vector_extract_simple(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector<1xf32>) - %1 = vector.extract %0[0] : vector<1xf32> - vector.yield %1 : f32 - } - return %r : f32 -} - -// ----- - -// CHECK-PROP-LABEL: func.func @vector_extractelement_0d( -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector) { -// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector -// CHECK-PROP: vector.yield %[[V]] : vector -// CHECK-PROP: } -// CHECK-PROP: %[[E:.*]] = vector.extractelement %[[R]][] : vector -// CHECK-PROP: return %[[E]] : f32 -func.func @vector_extractelement_0d(%laneid: index) -> (f32) { - %r = 
vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector) - %1 = vector.extractelement %0[] : vector - vector.yield %1 : f32 - } - return %r : f32 -} - -// ----- - -// CHECK-PROP: #[[$map:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)> -// CHECK-PROP: #[[$map1:.*]] = affine_map<()[s0] -> (s0 mod 3)> -// CHECK-PROP-LABEL: func.func @vector_extractelement_1d( -// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index -// CHECK-PROP-DAG: %[[C32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) { -// CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<96xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]] -// CHECK-PROP: %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]] -// CHECK-PROP: %[[EXTRACTED:.*]] = vector.extractelement %[[W]][%[[DISTR_POS]] : index] : vector<3xf32> -// CHECK-PROP: %[[FROM_LANE_I32:.*]] = arith.index_cast %[[FROM_LANE]] : index to i32 -// CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32 -// CHECK-PROP: return %[[SHUFFLED]] -func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { - %0 = "some_def"() : () -> (vector<96xf32>) - %1 = vector.extractelement %0[%pos : index] : vector<96xf32> - vector.yield %1 : f32 - } - return %r : f32 -} - -// ----- - -// CHECK-PROP: func @lane_dependent_warp_propagate_read -// CHECK-PROP-SAME: %[[ID:.*]]: index -func.func @lane_dependent_warp_propagate_read( - %laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) { - // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-PROP-NOT: vector.warp_execute_on_lane_0 - // CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32> - // CHECK-PROP: vector.transfer_write 
%[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) { - %2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32> - vector.yield %2 : vector<1x32xf32> - } - vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32> + }) : () -> () return } -// ----- - -// CHECK-PROP: func @dont_duplicate_read -func.func @dont_duplicate_read( - %laneid: index, %src: memref<1024xf32>) -> vector<1xf32> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 -// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { -// CHECK-PROP-NEXT: vector.transfer_read -// CHECK-PROP-NEXT: "blocking_use" -// CHECK-PROP-NEXT: vector.yield - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { - %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32> - "blocking_use"(%2) : (vector<32xf32>) -> () - vector.yield %2 : vector<32xf32> - } - return %r : vector<1xf32> -} - -// ----- - -// CHECK-PROP: func @dedup -func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>) - -> (vector<1xf32>, vector<1xf32>) { - - // CHECK-PROP: %[[SINGLE_RES:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) { - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) { - ^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>): - - // CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32> - %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32> - - // CHECK-PROP: vector.yield %[[SINGLE_VAL]] : vector<32xf32> - vector.yield %2, %2 : vector<32xf32>, vector<32xf32> - } - - // CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32> - return %r#0, %r#1 : vector<1xf32>, vector<1xf32> -} - 
-// ----- - -// CHECK-SCF-IF: func @warp_execute_has_broadcast_semantics -func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: vector, %v1: vector<1xf32>, %v2: vector<1x1xf32>) - -> (f32, vector, vector<1xf32>, vector<1x1xf32>) { - // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index - - // CHECK-SCF-IF: scf.if{{.*}}{ - %r:4 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%s0, %v0, %v1, %v2 : f32, vector, vector<1xf32>, vector<1x1xf32>) -> (f32, vector, vector<1xf32>, vector<1x1xf32>) { - ^bb0(%bs0: f32, %bv0: vector, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>): - - // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32> - // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32> - // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref, vector - // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3> - // CHECK-SCF-IF: "some_def_0"(%{{.*}}) : (f32) -> f32 - // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector) -> vector - // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32> - // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1x1xf32>) -> vector<1x1xf32> - // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, 3> - // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector, memref - // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, 3> - // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, 3> - - %rs0 = "some_def_0"(%bs0) : (f32) -> f32 - %rv0 = "some_def_1"(%bv0) : (vector) -> vector - %rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32> - %rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32> - - // CHECK-SCF-IF-NOT: vector.yield - vector.yield %rs0, %rv0, %rv1, %rv2 : f32, vector, vector<1xf32>, vector<1x1xf32> - } 
- - // CHECK-SCF-IF: gpu.barrier - // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32> - // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32> - // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref, vector - // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3> - // CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector, vector<1xf32>, vector<1x1xf32> - return %r#0, %r#1, %r#2, %r#3 : f32, vector, vector<1xf32>, vector<1x1xf32> -} - -// ----- -// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)> - -// CHECK-SCF-IF: func @warp_execute_nd_distribute -// CHECK-SCF-IF-SAME: (%[[LANEID:.*]]: index -func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %v1: vector<1x2x128xf32>) - -> (vector<1x64x1xf32>, vector<1x2x128xf32>) { - // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index - - // CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3> - // CHECK-SCF-IF: %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]] - // CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3> - // CHECK-SCF-IF: gpu.barrier - - // CHECK-SCF-IF: scf.if{{.*}}{ - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) { - ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>): - - // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32> - // CHECK-SCF-IF-DAG: %[[SR1:.*]] = 
vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32> - // CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32> - // CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32> - // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3> - // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3> - - %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32> - %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32> - - // CHECK-SCF-IF-NOT: vector.yield - vector.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32> - } - - // CHECK-SCF-IF: gpu.barrier - // CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]] - // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32> - // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32> - // CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32> - return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32> -}