diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -380,11 +380,14 @@
     auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
     if (destinationStyleOp) {
       for (OpOperand *outOperand : destinationStyleOp.getDpsInitOperands()) {
-        auto *it = llvm::find(dest, outOperand->get());
-        if (it == dest.end())
-          return op->emitOpError("must have \"tensor semantic\" for tiling");
-        unsigned destNum = std::distance(dest.begin(), it);
-        outOperand->set(destBbArgs[destNum]);
+        // Swap tensor inits with the corresponding block argument of the
+        // scf.forall op. Memref inits remain as is.
+        if (outOperand->get().getType().isa<TensorType>()) {
+          auto *it = llvm::find(dest, outOperand->get());
+          assert(it != dest.end() && "could not find destination tensor");
+          unsigned destNum = std::distance(dest.begin(), it);
+          outOperand->set(destBbArgs[destNum]);
+        }
       }
     }
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -274,34 +274,3 @@
   // expected-error @below {{duplicated attribute, cannot map different loops to the same processor}}
   transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
 }
-
-// -----
-
-func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
-  %one = arith.constant 1 : index
-  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
-             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
-  {
-    // expected-error @below {{'linalg.generic' op must have "tensor semantic" for tiling}}
-    // expected-note @below {{when applied to this op}}
-    linalg.generic
-      {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                        affine_map<(d0, d1) -> (d0, d1)>],
-       iterator_types = ["parallel", "parallel"]}
-      ins(%x : memref<32x32xf32>)
-      outs(%y : memref<32x32xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    }
-    gpu.terminator
-  }
-  return
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg0: !transform.any_op):
-  %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{transform.structured.tile_to_forall_op failed to apply}}
-  %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z> ] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-}
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -307,3 +307,39 @@
   transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 11, 1] warp_dims = [3, 2, 1] : (!transform.any_op) -> !transform.any_op
 }
+
+// -----
+
+// CHECK-LABEL: func.func @tiling_buffer_semantic_op(
+// CHECK:         gpu.launch {{.*}} {
+// CHECK:           scf.forall {{.*}} {
+// CHECK:             memref.subview
+// CHECK:             memref.subview
+// CHECK:             linalg.generic
+// CHECK:           }
+// CHECK:         }
+func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
+  %one = arith.constant 1 : index
+  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+  {
+    linalg.generic
+      {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>],
+       iterator_types = ["parallel", "parallel"]}
+      ins(%x : memref<32x32xf32>)
+      outs(%y : memref<32x32xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    }
+    gpu.terminator
+  }
+  return
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !transform.any_op):
+  %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z> ] )
+    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+}
diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
--- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
@@ -40,6 +40,53 @@

 // -----

+module {
+  // CHECK-LABEL: func @matmul_memref(
+  // CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           linalg.matmul
+  // CHECK:         } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  func.func @matmul_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+    linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
+                  outs(%C : memref<?x?xf32>)
+    return
+  }
+
+  transform.sequence failures(propagate) {
+  ^bb1(%arg1: !transform.any_op):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @copy_memref(
+  // CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           linalg.copy
+  // CHECK:         } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  func.func @copy_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>) {
+    linalg.copy ins(%A: memref<?x?xf32>)
+                outs(%B : memref<?x?xf32>)
+    return
+  }
+
+  transform.sequence failures(propagate) {
+  ^bb1(%arg1: !transform.any_op):
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  }
+}
+
+// -----
+
 // In this test case, matmul dims and tile size are dynamic.

 // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
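
Note (not part of the patch): a rough, hand-written sketch of the kind of IR the new memref tests expect after transform.structured.tile_to_forall_op. With buffer semantics there is nothing to swap into the scf.forall block arguments, so the loop body simply takes memref.subviews of the original operands and the linalg op runs on those views. All shapes, tile sizes, and value names below are illustrative assumptions rather than values taken from the tests, and the implicit empty scf.forall.in_parallel terminator is left out.

    func.func @copy_tiled_sketch(%A: memref<64x64xf32>, %B: memref<64x64xf32>) {
      // num_threads [4, 8] over a 64x64 buffer: each thread copies a 16x8 tile.
      scf.forall (%i, %j) in (4, 8) {
        %row = affine.apply affine_map<(d0) -> (d0 * 16)>(%i)
        %col = affine.apply affine_map<(d0) -> (d0 * 8)>(%j)
        // The memref inits stay in place; only subviews are taken.
        %src = memref.subview %A[%row, %col] [16, 8] [1, 1]
            : memref<64x64xf32> to memref<16x8xf32, strided<[64, 1], offset: ?>>
        %dst = memref.subview %B[%row, %col] [16, 8] [1, 1]
            : memref<64x64xf32> to memref<16x8xf32, strided<[64, 1], offset: ?>>
        linalg.copy ins(%src : memref<16x8xf32, strided<[64, 1], offset: ?>>)
                    outs(%dst : memref<16x8xf32, strided<[64, 1], offset: ?>>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      return
    }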