diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -165,9 +165,9 @@ //===----------------------------------------------------------------------===// def DecomposeOp : Op { let description = [{ @@ -414,8 +414,8 @@ [DenseArrayNonNegative]>:$iterator_interchange); let results = (outs TransformHandleTypeInterface:$transformed); - let assemblyFormat = [{ - $target + let assemblyFormat = [{ + $target (`iterator_interchange` `=` $iterator_interchange^)? attr-dict `:` custom(type($target), type($transformed)) }]; @@ -479,7 +479,7 @@ TransformOpInterface, ReportTrackingListenerFailuresOpTrait]> { let description = [{ - Lower a tensor.unpack into empty + linalg.transpose + tensor.collapse_shape + + Lower a tensor.unpack into empty + linalg.transpose + tensor.collapse_shape + tensor.extract_slice. #### Return modes @@ -497,7 +497,7 @@ Transform_ConcreteOpType<"linalg.transpose">:$transpose_op, Transform_ConcreteOpType<"tensor.collapse_shape">:$collapse_shape_op, Transform_ConcreteOpType<"tensor.extract_slice">:$extract_slice_op); - let assemblyFormat = [{ + let assemblyFormat = [{ $target attr-dict `:` functional-type(operands, results) }]; @@ -665,7 +665,7 @@ let description = [{ Pack a LinalgOp by applying a data tiling transformation on the op and packing the operands according to the `packed_sizes` specification. - + Iterator dimensions are tiled in their canonical order in the op spec. Operands are packed according to the same canonical order of the op iterator dimensions. @@ -700,7 +700,7 @@ // affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)> // M N m n // affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> - %0 = linalg.generic_representing_some_higher_d_matmul + %0 = linalg.generic_representing_some_higher_d_matmul ins(%A, %B: tensor, tensor) outs( %C: tensor) ``` @@ -727,7 +727,7 @@ DefaultValuedAttr:$static_packed_sizes); let results = (outs TransformHandleTypeInterface:$packed_op); let assemblyFormat = [{ - $target + $target `packed_sizes` `=` custom($packed_sizes, $static_packed_sizes, type($packed_sizes)) @@ -756,27 +756,27 @@ Target a Linalg op and rewrite it into packed LinalgOp form by trying to infer whether a known suboperation is embedded - Different packing strategies are applied in order, when one applies + Different packing strategies are applied in order, when one applies successfully, the transform returns: 1. Matmul packing: Try to infer a matmul operation embedded in the target op. Specifically, this looks for 2 parallel dimensions that participate in an outer-product and 1 reduction dimension. These dimensions are referred as (m, n, k) to match canonical matmul terminology. - + The packed sizes for (m, n, k) are specified by `matmul_packed_sizes` and the optional `matmul_padded_sizes_next_multiple_of`. - When an entry `matmul_packed_sizes[i]` is non-0, the corresponding + When an entry `matmul_packed_sizes[i]` is non-0, the corresponding dimension is packed by `matmul_packed_sizes[i]`. Otherwise, the dimension is merely padded to the next multiple of `matmul_padded_sizes_next_multiple_of[i]`. `matmul_padded_sizes_next_multiple_of` is optional and is expected to either be empty or of size `3`, matching the size of `matmul_packed_sizes`. 
- For each individual element of `matmul_packed_sizes` and + For each individual element of `matmul_packed_sizes` and `matmul_padded_sizes_next_multiple_of`, only one of them is allowed to be non-zero. - + The ordering of the packed dimensions (mm, nn, kk) is specified by the `matmul_inner_dims_order` attribute. @@ -787,7 +787,7 @@ the most minor indexing dimensions of the linalg.generic. The most minor dimensions are themselves ordered according to `inner_dims_order`. 4. An elementwise traversal of `matmul_packed_sizes` and - `matmul_padded_sizes_next_multiple_of` is performed and for each + `matmul_padded_sizes_next_multiple_of` is performed and for each dimension `d`, either pack to `matmul_packed_sizes[d]` or pad to the `matmul_padded_sizes_next_multiple_of[d]`. 5. Packing/padding is performed by the amounts determined in step 4. and @@ -815,7 +815,7 @@ [DenseArrayCount<3>]>:$static_matmul_packed_sizes, ConfinedAttr, [Attr< - Or<[DenseArrayCount<0>.predicate, + Or<[DenseArrayCount<0>.predicate, DenseArrayCount<3>.predicate]>, "with 0 or 3 elements" >]> @@ -837,7 +837,7 @@ `matmul_packed_sizes` `=` custom($matmul_packed_sizes, $static_matmul_packed_sizes, type($matmul_packed_sizes)) - (`matmul_padded_sizes_next_multiple_of` `=` + (`matmul_padded_sizes_next_multiple_of` `=` $matmul_padded_sizes_next_multiple_of^)? `matmul_inner_dims_order` `=` $matmul_inner_dims_order ) @@ -862,7 +862,7 @@ DeclareOpInterfaceMethods, ReportTrackingListenerFailuresOpTrait]> { let description = [{ - Apply a transposition to a single `tensor.pack` (resp. `tensor.unpack`) and + Apply a transposition to a single `tensor.pack` (resp. `tensor.unpack`) and update the `linalg.generic` op that consumes (resp. produces) the operation. This transform allows composing a simple `structured.pack` with additional @@ -874,7 +874,7 @@ the specified `tensor.pack` or `tensor.unpack` op. If the `target` of this op is a `tensor.pack` then a new `tensor.empty` will - be created along with transposed versions of the `tensor.pack` and the + be created along with transposed versions of the `tensor.pack` and the consuming `linalg.generic`, which is expected to be the sole consumer. If the `target` of this op is a `tensor.unpack` then the whole pack / compute @@ -894,7 +894,7 @@ This operation returns 3 handles, one to the transformed LinalgOp, one to the transformed `tensor.pack` and one to the transformed `tensor.unpack`. - The last handle for `tensor.unpack` is empty if `target_pack_or_unpack_op` + The last handle for `tensor.unpack` is empty if `target_pack_or_unpack_op` was not itself a `tensor.unpack`. }]; @@ -971,7 +971,7 @@ let builders = [ // Builder for a transform::PadOp with automatic inference of padding // value. Warning: this will set the value 0 for the inferred elemental - // type without taking the op into account and thus only work for the + // type without taking the op into account and thus only work for the // add/mul ring at the moment. // TODO: support other operations (e.g. min, max etc). OpBuilder<(ins "Value":$target, @@ -1048,7 +1048,7 @@ Hoist the tensor.pad target operation by at most the given number of loops. Optionally apply the transpose attribute to the inner dimensions. - TODO: In the future, we should consider rewriting as a tensor.pack after + TODO: In the future, we should consider rewriting as a tensor.pack after hoisting since this abstraction is now available. TODO: Maybe also return the linalg.generic transpose created at some point. 
@@ -1060,7 +1060,7 @@ If all the operations referred to by the `target` handle padproperly, the transform succeeds. Otherwise the transform silently fails. - The return handle points to only the subset of successfully hoisted + The return handle points to only the subset of successfully hoisted tensor.pad operations, which can be empty. }]; @@ -1073,9 +1073,9 @@ let results = (outs TransformHandleTypeInterface:$transformed); let assemblyFormat = [{ - $target - `by` $num_loops `loops` - (`,` `transpose` `by` $transpose^)? + $target + `by` $num_loops `loops` + (`,` `transpose` `by` $transpose^)? attr-dict `:` functional-type(operands, results) }]; @@ -1122,6 +1122,7 @@ DefaultValuedAttr:$use_full_tile_buffers, UnitAttr:$use_full_tiles_by_default, UnitAttr:$use_alloca, + OptionalAttr:$memory_space, OptionalAttr:$mapping, OptionalAttr:$alignment); let results = (outs TransformHandleTypeInterface:$transformed); @@ -1202,7 +1203,7 @@ let arguments = (ins TransformHandleTypeInterface:$target); let results = (outs TransformHandleTypeInterface:$result); - let assemblyFormat = + let assemblyFormat = "$target attr-dict `:`" "custom(type($target), type($result))"; @@ -1248,9 +1249,9 @@ def RewriteInDestinationPassingStyleOp : Op< Transform_Dialect, "structured.rewrite_in_destination_passing_style", - [FunctionalStyleTransformOpTrait, + [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface, - TransformOpInterface, + TransformOpInterface, TransformEachOpTrait, ReportTrackingListenerFailuresOpTrait]> { let description = [{ @@ -1260,7 +1261,7 @@ - tensor.pad - tensor.generate - tensor.from_elements - This dichotomy hints at a future interface, for now the implementation just + This dichotomy hints at a future interface, for now the implementation just switches between different implementation. #### Return modes @@ -1271,7 +1272,7 @@ The return handle points to a subset of successfully produced operations: - `tensor.pad` case, the returned handle points to the tensor.insert_slice. - `tensor.generate` case, the returned handle points to the linalg.generic. - - `tensor.from_elements` case, the returned handle points to the last + - `tensor.from_elements` case, the returned handle points to the last `tensor.insert`. }]; @@ -1483,7 +1484,7 @@ TransformHandleTypeInterface:$split_linalg_op, TransformHandleTypeInterface:$combining_linalg_op); - let assemblyFormat = + let assemblyFormat = "$target attr-dict `:`" "functional-type(operands, results)"; @@ -1990,7 +1991,7 @@ DefaultValuedOptionalAttr:$interchange); let results = (outs TransformHandleTypeInterface:$tiled_linalg_op, Variadic:$loops); - + let builders = [ OpBuilder<(ins "Value":$target, "ArrayRef":$mixedTileSizes, @@ -2057,7 +2058,7 @@ UnitAttr:$disable_transfer_permutation_map_lowering_patterns); let results = (outs TransformHandleTypeInterface:$transformed); - let assemblyFormat = + let assemblyFormat = "$target attr-dict `:`" "functional-type(operands, results)"; @@ -2279,16 +2280,16 @@ TransformOpInterface, ReportTrackingListenerFailuresOpTrait]> { let description = [{ - Hoists supported tensor subset extract/insert operation pairs out of + Hoists supported tensor subset extract/insert operation pairs out of immediately enclosing loop iteratively, if the following conditions are true: 1. The 2 ops access the same tensor subset. 2. All operands are invariant under the enclosing loop. 
- + The supported subset extract/insert operation pairs currently comprise: - tensor.extract_slice / tensor.insert_slice - vector.transfer_read / vector.transfer_write on tensors - + Only scf.for loops are currently supported. When applied to: @@ -2304,8 +2305,8 @@ let results = (outs); let assemblyFormat = [{ - $target - attr-dict + $target + attr-dict `:` functional-type(operands, results) }]; @@ -2328,7 +2329,7 @@ TransformEachOpTrait, TransformOpInterface]> { let description = [{ Targeted rewrite of an tensor.insert_slice to linalg.copy. - This is useful to materialize copies explicitly before bufferization and + This is useful to materialize copies explicitly before bufferization and transform them, avoiding the need to rediscover them after bufferization. If the insert_slice source is already a linalg.copy, only return the source @@ -2336,7 +2337,7 @@ #### Return modes: - The operation always succeeds and returns a handle to the relevant + The operation always succeeds and returns a handle to the relevant linalg.copy op. }]; diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -362,6 +362,13 @@ alignment = align; return *this; } + /// Memory space of promoted buffer. If `std::nullopt` do not specify memory + /// space. + std::optional memorySpace; + LinalgPromotionOptions &setMemorySpace(Attribute memorySpc) { + memorySpace = memorySpc; + return *this; + } /// Use alloca with the default allocation scheme. bool useAlloca = false; LinalgPromotionOptions &setUseAlloca(bool use) { diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1883,6 +1883,8 @@ llvm::to_vector(getUseFullTileBuffers().getAsValueRange())); if (getAlignment().has_value()) promotionOptions = promotionOptions.setAlignment(*getAlignment()); + if (getMemorySpace().has_value()) + promotionOptions = promotionOptions.setMemorySpace(*getMemorySpace()); if (getMapping().has_value()) { // The mapping should only contain an element diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -54,10 +54,16 @@ if (alignment.has_value()) alignmentAttr = b.getI64IntegerAttr(alignment.value()); + Attribute memorySpaceAttr; + if (options.memorySpace.has_value()) + memorySpaceAttr = *options.memorySpace; + // Static buffer. if (std::optional cst = getConstantIntValue(allocSize)) { auto staticBufferType = MemRefType::get(width * cst.value(), b.getIntegerType(8)); + staticBufferType = + MemRefType::Builder(staticBufferType).setMemorySpace(memorySpaceAttr); if (options.useAlloca) { return b.create(staticBufferType, ValueRange{}, alignmentAttr); @@ -69,6 +75,8 @@ // Fallback dynamic buffer. 
auto dynamicBufferType = MemRefType::get(ShapedType::kDynamic, b.getIntegerType(8)); + dynamicBufferType = + MemRefType::Builder(dynamicBufferType).setMemorySpace(memorySpaceAttr); Value mul = b.createOrFold( b.create(width), allocSize); if (options.useAlloca) @@ -89,6 +97,10 @@ auto zero = b.create(0); auto one = b.create(1); + Attribute memorySpaceAttr; + if (options.memorySpace.has_value()) + memorySpaceAttr = *options.memorySpace; + Value allocSize = one; for (const auto &size : llvm::enumerate(boundingSubViewSize)) allocSize = b.createOrFold(allocSize, size.value()); @@ -96,9 +108,12 @@ layout, alignment); SmallVector dynSizes(boundingSubViewSize.size(), ShapedType::kDynamic); - Value view = b.createOrFold( - MemRefType::get(dynSizes, viewType.getElementType()), buffer, zero, - boundingSubViewSize); + + auto viewMemRefType = MemRefType::get(dynSizes, viewType.getElementType()); + viewMemRefType = + MemRefType::Builder(viewMemRefType).setMemorySpace(memorySpaceAttr); + Value view = b.createOrFold(viewMemRefType, buffer, zero, + boundingSubViewSize); return view; } diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir --- a/mlir/test/Dialect/Linalg/promote.mlir +++ b/mlir/test/Dialect/Linalg/promote.mlir @@ -275,3 +275,111 @@ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.promote %0 : (!transform.any_op) -> !transform.any_op } + +// ----- + +#map = affine_map<(d0, d1) -> (d0, d1)> + + // CHECK-LABEL: func.func @linalg_generic_update_all_function_inputs_outputs( + // CHECK-SAME: %[[VAL_0:.*]]: memref<3x4xf32, 1>, + // CHECK-SAME: %[[VAL_1:.*]]: memref<3x4xf32, 1>) -> memref<3x4xf32, 1> { +func.func @linalg_generic_update_all_function_inputs_outputs(%arg0: memref<3x4xf32, 1>, %arg1: memref<3x4xf32, 1>) -> memref<3x4xf32, 1> { + // CHECK: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<3x4xf32, 1> + // CHECK: %[[VAL_3:.*]] = memref.subview %[[VAL_0]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: %[[VAL_4:.*]] = memref.subview %[[VAL_1]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: %[[VAL_5:.*]] = memref.subview %[[VAL_2]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + + %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x4xf32, 1> + %subview = memref.subview %arg0[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + %subview_0 = memref.subview %arg1[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + %subview_1 = memref.subview %alloc[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + + // CHECK: %[[VAL_6:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_7:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_9:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_10:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_11:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_12:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_13:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_14:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_15:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_16:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_17:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_18:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_19:.*]] = arith.constant 1 : index + // CHECK: 
%[[VAL_20:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_21:.*]] = arith.constant 12 : index + // CHECK: %[[VAL_22:.*]] = memref.alloc() : memref<48xi8, #gpu.address_space> + // CHECK: %[[VAL_23:.*]] = memref.view %[[VAL_22]]{{\[}}%[[VAL_18]]]{{\[}}%[[VAL_12]], %[[VAL_15]]] : memref<48xi8, #gpu.address_space> to memref> + // CHECK: %[[VAL_24:.*]] = memref.subview %[[VAL_23]][0, 0] {{\[}}%[[VAL_14]], %[[VAL_17]]] [1, 1] : memref> to memref, #gpu.address_space> + // CHECK: %[[VAL_25:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_26:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_27:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_28:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_29:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_30:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_31:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_32:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_33:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_34:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_35:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_36:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_37:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_38:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_39:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_40:.*]] = arith.constant 12 : index + // CHECK: %[[VAL_41:.*]] = memref.alloc() : memref<48xi8, #gpu.address_space> + // CHECK: %[[VAL_42:.*]] = memref.view %[[VAL_41]]{{\[}}%[[VAL_37]]]{{\[}}%[[VAL_31]], %[[VAL_34]]] : memref<48xi8, #gpu.address_space> to memref> + // CHECK: %[[VAL_43:.*]] = memref.subview %[[VAL_42]][0, 0] {{\[}}%[[VAL_33]], %[[VAL_36]]] [1, 1] : memref> to memref, #gpu.address_space> + // CHECK: %[[VAL_44:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_45:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_46:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_47:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_48:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_49:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_50:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_51:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_52:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_53:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_54:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_55:.*]] = arith.constant 3 : index + // CHECK: %[[VAL_56:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_57:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_58:.*]] = arith.constant 4 : index + // CHECK: %[[VAL_59:.*]] = arith.constant 12 : index + // CHECK: %[[VAL_60:.*]] = memref.alloc() : memref<48xi8, #gpu.address_space> + // CHECK: %[[VAL_61:.*]] = memref.view %[[VAL_60]]{{\[}}%[[VAL_56]]]{{\[}}%[[VAL_50]], %[[VAL_53]]] : memref<48xi8, #gpu.address_space> to memref> + // CHECK: %[[VAL_62:.*]] = memref.subview %[[VAL_61]][0, 0] {{\[}}%[[VAL_52]], %[[VAL_55]]] [1, 1] : memref> to memref, #gpu.address_space> + // CHECK: memref.copy %[[VAL_3]], %[[VAL_24]] : memref<4x3xf32, strided<[4, 1]>, 1> to memref, #gpu.address_space> + // CHECK: memref.copy %[[VAL_4]], %[[VAL_43]] : memref<4x3xf32, strided<[4, 1]>, 1> to memref, #gpu.address_space> + // CHECK: memref.copy %[[VAL_5]], %[[VAL_62]] : memref<4x3xf32, strided<[4, 1]>, 1> to memref, #gpu.address_space> + // CHECK: linalg.generic {doc = "", indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"], library_call = ""} ins(%[[VAL_24]], %[[VAL_43]] : memref, #gpu.address_space>, memref, #gpu.address_space>) outs(%[[VAL_62]] : memref, 
#gpu.address_space>) { + // CHECK: ^bb0(%[[VAL_63:.*]]: f32, %[[VAL_64:.*]]: f32, %[[VAL_65:.*]]: f32): + // CHECK: %[[VAL_66:.*]] = arith.addf %[[VAL_63]], %[[VAL_64]] : f32 + // CHECK: linalg.yield %[[VAL_66]] : f32 + // CHECK: } + + + linalg.generic {doc = "", indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"], library_call = ""} ins(%subview, %subview_0 : memref<4x3xf32, strided<[4, 1]>, 1>, memref<4x3xf32, strided<[4, 1]>, 1>) outs(%subview_1 : memref<4x3xf32, strided<[4, 1]>, 1>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %1 = arith.addf %in, %in_1 : f32 + linalg.yield %1 : f32 + } + + // CHECK: memref.copy %[[VAL_62]], %[[VAL_5]] : memref, #gpu.address_space> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: memref.dealloc %[[VAL_22]] : memref<48xi8, #gpu.address_space> + // CHECK: memref.dealloc %[[VAL_41]] : memref<48xi8, #gpu.address_space> + // CHECK: memref.dealloc %[[VAL_60]] : memref<48xi8, #gpu.address_space> + // CHECK: return %[[VAL_2]] : memref<3x4xf32, 1> + // CHECK: } + + return %alloc : memref<3x4xf32, 1> +} + + +transform.sequence failures(propagate) { +^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.promote %0 { memory_space = #gpu.address_space } : (!transform.any_op) -> !transform.any_op +}
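For completeness, below is a minimal C++ sketch of how the new `memory_space` plumbing is meant to be driven outside the transform dialect, assuming the existing `linalg::promoteSubViews` entry point. The helper name `promoteToWorkgroup` and the choice of `gpu::AddressSpaceAttr` are illustrative only; `setMemorySpace` accepts any `Attribute`.

```c++
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"

using namespace mlir;

// Hypothetical helper: promote the subview operands of `op` into buffers
// allocated in GPU workgroup memory, mirroring what the transform op does
// when a memory_space attribute is supplied.
static FailureOr<linalg::LinalgOp> promoteToWorkgroup(OpBuilder &b,
                                                      linalg::LinalgOp op) {
  auto workgroup = gpu::AddressSpaceAttr::get(b.getContext(),
                                              gpu::AddressSpace::Workgroup);
  linalg::LinalgPromotionOptions options;
  options.setMemorySpace(workgroup);
  // With this patch, the allocation path in Promotion.cpp propagates the
  // requested memory space to the memref.alloc / memref.view / memref.subview
  // it creates for each promoted operand.
  return linalg::promoteSubViews(b, op, options);
}
```

This mirrors the transform-dialect usage exercised by the test above, where the same effect is obtained by passing the attribute directly to `transform.structured.promote`.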