diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td @@ -355,6 +355,25 @@ return bufferization::getMemRefType(tensorType, options); }] >, + InterfaceMethod< + /*desc=*/[{ + Return the memory space of the given tensor OpResult if specified on + this op. If not specified, return `failure`. + + This method will never be called with OpResults that do not bufferize + to a memory allocation. + }], + /*retType=*/"FailureOr", + /*methodName=*/"getMemorySpace", + /*args=*/(ins "OpResult":$opResult), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + assert(cast($_op.getOperation()) + .bufferizesToAllocation(opResult) + && "expected allocation"); + return failure(); + }] + >, ]; let extraClassDeclaration = [{ diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td @@ -47,8 +47,9 @@ another op. The optional `memory_space` attribute specifies the memory space when - bufferizing this op. If `memory_space` is not specified, the default memory - space is used during bufferization. + bufferizing this op. The memory space is inferred from `copy` if specified. + If neither `copy` nor `memory_space` is specified, the default memory space + is used during bufferization. Both dense and sparse tensor types are supported. 
The result of a `bufferization.alloc_tensor` is a tensor value that can be used like any @@ -77,6 +78,12 @@ bool bufferizesToAllocation(OpResult opResult) { return true; } + FailureOr getMemorySpace(OpResult opResult) { + if (getMemorySpace().hasValue()) + return static_cast(*getMemorySpace()); + return failure(); + } + bool bufferizesToMemoryRead(OpOperand &opOperand, const AnalysisState &state); diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -347,6 +347,10 @@ "Bufferize function boundaries (experimental).">, Option<"createDeallocs", "create-deallocs", "bool", /*default=*/"true", "Specify if new allocations should be deallocated.">, + Option<"mustInferMemorySpace", "must-infer-memory-space", "bool", + /*default=*/"false", + "The memory space of an memref types must always be inferred. If " + "unset, a default memory space of 0 is used otherwise.">, ]; let constructor = "mlir::bufferization::createTensorCopyInsertionPass()"; } diff --git a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp @@ -26,6 +26,11 @@ const BufferizationOptions &options) const { auto constantOp = cast(op); + // TODO: Implement memory space for this op. E.g., by adding a memory_space + // attribute to ConstantOp. + if (options.defaultMemorySpace != static_cast(0)) + return op->emitError("memory space not implemented yet"); + // Only ranked tensors are supported. 
if (!constantOp.getType().isa()) return failure(); @@ -150,6 +155,10 @@ return failure(); Value trueBuffer = *maybeTrueBuffer; Value falseBuffer = *maybeFalseBuffer; + BaseMemRefType trueType = trueBuffer.getType().cast(); + BaseMemRefType falseType = falseBuffer.getType().cast(); + if (trueType.getMemorySpaceAsInt() != falseType.getMemorySpaceAsInt()) + return op->emitError("inconsistent memory space on true/false operands"); // The "true" and the "false" operands must have the same type. If the // buffers have different types, they differ only in their layout map. Cast diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -43,6 +43,13 @@ constexpr const ::llvm::StringLiteral bufferization::BufferizableOpInterface::kInplaceableAttrName; +/// Return the owner of the given value. +static Operation *getOwnerOfValue(Value value) { + if (auto opResult = value.dyn_cast()) + return opResult.getDefiningOp(); + return value.cast().getOwner()->getParentOp(); +} + /// Create an AllocTensorOp for the given shaped value. If `copy` is set, the /// shaped value is copied. Otherwise, a tensor with undefined contents is /// allocated. @@ -84,10 +91,21 @@ populateDynamicDimSizes(b, loc, tensor, dynamicSizes); } + // Create AllocTensorOp. auto allocTensorOp = b.create(loc, tensorType, dynamicSizes, copy ? tensor : Value()); allocTensorOp->setAttr(BufferizationDialect::kEscapeAttrName, b.getBoolArrayAttr({escape})); + + // Add 'memory_space' attribute. Not needed if 'copy' operand is specified. 
+ if (copy) + return allocTensorOp.getResult(); + FailureOr copyBufferType = getBufferType(tensor, options); + if (failed(copyBufferType)) + return failure(); + allocTensorOp.setMemorySpaceAttr( + b.getIntegerAttr(b.getIntegerType(64, /*isSigned=*/false), + copyBufferType->getMemorySpaceAsInt())); return allocTensorOp.getResult(); } @@ -512,16 +530,43 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options) { auto tensorType = value.getType().dyn_cast(); assert(tensorType && "unexpected non-tensor type"); + Operation *op = getOwnerOfValue(value); + // ToTensorOp: Take buffer type directly from the op. if (auto toTensorOp = value.getDefiningOp()) return toTensorOp.getMemref().getType().cast(); + // If value is a bbArg of a bufferizable op: query op interface. if (auto bbArg = value.dyn_cast()) if (auto bufferizableOp = options.dynCastBufferizableOp(bbArg.getOwner()->getParentOp())) return bufferizableOp.getBufferType(bbArg, options); - return getMemRefType(tensorType, options); + // Check value is a new buffer allocation with a memory space attribute. In + // that case we can at least infer the memory space. + Optional memorySpace = None; + if (auto opResult = value.dyn_cast()) { + if (auto bufferizableOp = + options.dynCastBufferizableOp(opResult.getDefiningOp())) { + if (bufferizableOp.bufferizesToAllocation(opResult)) { + FailureOr queriedMemorySpace = + bufferizableOp.getMemorySpace(opResult); + if (!failed(queriedMemorySpace)) + memorySpace = *queriedMemorySpace; + } + } + } + + // If we still do not know the memory space, use the default memory space (if + // any). + if (!memorySpace.hasValue()) + memorySpace = options.defaultMemorySpace; + + // If we still do not know the memory space, report a failure. 
+ if (!memorySpace.hasValue()) + return op->emitError("could not infer memory space"); + + return getMemRefType(tensorType, options, /*layout=*/{}, *memorySpace); } void bufferization::replaceOpWithBufferizedValues(RewriterBase &rewriter, diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -174,6 +174,9 @@ unsigned memorySpace; if (getMemorySpace().hasValue()) { memorySpace = *getMemorySpace(); + } else if (getCopy()) { + memorySpace = + copyBuffer.getType().cast().getMemorySpaceAsInt(); } else if (options.defaultMemorySpace.hasValue()) { memorySpace = *options.defaultMemorySpace; } else { diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp @@ -105,6 +105,8 @@ options.allowReturnAllocs = allowReturnAllocs; options.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; options.createDeallocs = createDeallocs; + if (mustInferMemorySpace) + options.defaultMemorySpace = None; if (failed(insertTensorCopies(getOperation(), options))) signalPassFailure(); } diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -706,6 +706,8 @@ return success(); } + // TODO: Implement getBufferType interface method and infer buffer types. 
+ LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto whileOp = cast(op); diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -379,6 +379,10 @@ const BufferizationOptions &options) const { auto fromElementsOp = cast(op); + // TODO: Implement memory space for this op. + if (options.defaultMemorySpace != static_cast(0)) + return op->emitError("memory space not implemented yet"); + // Allocate a buffer for the result. Location loc = op->getLoc(); auto tensorType = fromElementsOp.getType().cast(); @@ -435,6 +439,11 @@ LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto generateOp = cast(op); + + // TODO: Implement memory space for this op. + if (options.defaultMemorySpace != static_cast(0)) + return op->emitError("memory space not implemented yet"); + auto tensorType = generateOp.getType().cast(); // Allocate memory. 
Location loc = op->getLoc(); @@ -792,7 +801,9 @@ if (failed(srcBuffer) || failed(shapeBuffer)) return failure(); auto resultTensorType = reshapeOp.getResult().getType().cast(); - auto resultMemRefType = getMemRefType(resultTensorType, options); + auto resultMemRefType = getMemRefType( + resultTensorType, options, /*layout=*/{}, + srcBuffer->getType().cast().getMemorySpaceAsInt()); replaceOpWithNewBufferizedOp( rewriter, op, resultMemRefType, *srcBuffer, *shapeBuffer); return success(); diff --git a/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir b/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-opt %s -one-shot-bufferize="must-infer-memory-space" -split-input-file -verify-diagnostics + +func.func @inconsistent_memory_space_arith_select(%c: i1) -> tensor<10xf32> { + // Selecting tensors with different memory spaces. Such IR cannot be + // bufferized. 
+ %0 = bufferization.alloc_tensor() {memory_space = 0 : ui64} : tensor<10xf32> + %1 = bufferization.alloc_tensor() {memory_space = 1 : ui64} : tensor<10xf32> + // expected-error @+2 {{inconsistent memory space on true/false operands}} + // expected-error @+1 {{failed to bufferize op}} + %r = arith.select %c, %0, %1 : tensor<10xf32> + func.return %r : tensor<10xf32> +} + +// ----- + +func.func @constant_memory_space(%idx: index, %v: i32) -> tensor<3xi32> { + // expected-error @+2 {{memory space not implemented yet}} + // expected-error @+1 {{failed to bufferize op}} + %cst = arith.constant dense<[5, 1000, 20]> : tensor<3xi32> + %0 = tensor.insert %v into %cst[%idx] : tensor<3xi32> + return %0 : tensor<3xi32> +} \ No newline at end of file diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir @@ -6,3 +6,14 @@ %0 = bufferization.alloc_tensor() : tensor<10xf32> return %0 : tensor<10xf32> } + +// ----- + +func.func @memory_space_of_unknown_op() -> f32 { + %c0 = arith.constant 0 : index + // expected-error @+1 {{could not infer memory space}} + %t = "test.dummy_op"() : () -> (tensor<10xf32>) + // expected-error @+1 {{failed to bufferize op}} + %s = tensor.extract %t[%c0] : tensor<10xf32> + return %s : f32 +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir @@ -0,0 +1,12 @@ +// RUN: mlir-opt %s -tensor-copy-insertion="must-infer-memory-space" -split-input-file 
-verify-diagnostics + +// An alloc is inserted but the copy is omitted. Therefore, the memory space +// should be specified on the alloc_tensor op. +func.func @memory_space_of_unknown_op() -> (tensor<10xf32>, tensor<10xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + // expected-error @+1 {{could not infer memory space}} + %t = bufferization.alloc_tensor() : tensor<10xf32> + %s = tensor.insert %cst into %t[%c0] : tensor<10xf32> + return %s, %t : tensor<10xf32>, tensor<10xf32> +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir @@ -0,0 +1,25 @@ +// RUN: mlir-opt %s -tensor-copy-insertion="must-infer-memory-space" -split-input-file | FileCheck %s + +// CHECK-LABEL: func @unknown_op_copy +func.func @unknown_op_copy() -> (tensor<10xf32>, tensor<10xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + // CHECK: %[[dummy:.*]] = "test.dummy_op"() : () -> tensor<10xf32> + %t = "test.dummy_op"() : () -> tensor<10xf32> + // CHECK: %[[copy:.*]] = bufferization.alloc_tensor() copy(%[[dummy]]) {bufferization.escape = [false]} : tensor<10xf32> + %s = tensor.insert %cst into %t[%c0] : tensor<10xf32> + return %s, %t : tensor<10xf32>, tensor<10xf32> +} + +// ----- + +// CHECK-LABEL: func @alloc_tensor_copy +func.func @alloc_tensor_copy() -> (tensor<10xf32>, tensor<10xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], memory_space = 1 : ui64} : tensor<10xf32> + %t = bufferization.alloc_tensor() {memory_space = 1 : ui64} : tensor<10xf32> + // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], memory_space = 1 : ui64} : tensor<10xf32> + %s = tensor.insert %cst into 
%t[%c0] : tensor<10xf32> + return %s, %t : tensor<10xf32>, tensor<10xf32> +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir @@ -40,10 +40,10 @@ { // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32> // The second alloc_tensor should not have a copy operand. - // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32> + // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], memory_space = 0 : ui64} : tensor<5xf32> // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true]} : tensor<5xf32> - // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true]} : tensor<5xf32> + // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true], memory_space = 0 : ui64} : tensor<5xf32> %0 = bufferization.alloc_tensor() : tensor<5xf32> %1 = tensor.insert %f into %0[%idx] : tensor<5xf32> return %0, %1 : tensor<5xf32>, tensor<5xf32> @@ -55,7 +55,7 @@ func.func @do_not_copy_when_overwritten(%t: tensor<5xf32>, %f: f32) -> (tensor<5xf32>, tensor<5xf32>) { - // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32> + // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false], memory_space = 0 : ui64} : tensor<5xf32> // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<5xf32>) %r = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>], @@ -74,7 +74,7 @@ -> (tensor<3xf32>) { %0 = tensor.extract_slice %t[0][3][1] : tensor<5xf32> to tensor<3xf32> - // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<3xf32> + // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() 
{bufferization.escape = [false], memory_space = 0 : ui64} : tensor<3xf32> // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<3xf32>) %r = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>], diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -one-shot-bufferize -split-input-file -verify-diagnostics + +func.func @inconsistent_memory_space_scf_if(%c: i1) -> tensor<10xf32> { + // Yielding tensors with different memory spaces. Such IR cannot be + // bufferized. + %0 = bufferization.alloc_tensor() {memory_space = 0 : ui64} : tensor<10xf32> + %1 = bufferization.alloc_tensor() {memory_space = 1 : ui64} : tensor<10xf32> + // expected-error @+2 {{inconsistent memory space on then/else branches}} + // expected-error @+1 {{failed to bufferize op}} + %r = scf.if %c -> tensor<10xf32> { + scf.yield %0 : tensor<10xf32> + } else { + scf.yield %1 : tensor<10xf32> + } + func.return %r : tensor<10xf32> +} diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -632,3 +632,80 @@ } return %0 : tensor<8x8xf32> } + +// ----- + +// CHECK-LABEL: func @scf_if_memory_space +func.func @scf_if_memory_space(%c: i1, %f: f32) -> (f32, f32) +{ + %c0 = arith.constant 0 : index + // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32, 1> + %0 = bufferization.alloc_tensor() {memory_space = 1 : ui64} : tensor<5xf32> + // CHECK: scf.if %{{.*}} -> (memref<5xf32, 1>) { + %1 = scf.if %c -> tensor<5xf32> { + // CHECK: %[[cloned:.*]] = bufferization.clone %[[alloc]] + // CHECK: scf.yield %[[cloned]] + scf.yield %0 : tensor<5xf32> + } else { + // CHECK: %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5xf32, 1> + // CHECK: 
memref.store %{{.*}}, %[[alloc2]] + // CHECK: %[[cloned2:.*]] = bufferization.clone %[[alloc2]] + // CHECK: memref.dealloc %[[alloc2]] + // CHECK: scf.yield %[[cloned2]] + %2 = tensor.insert %f into %0[%c0] : tensor<5xf32> + scf.yield %2 : tensor<5xf32> + } + %r0 = tensor.extract %0[%c0] : tensor<5xf32> + %r1 = tensor.extract %1[%c0] : tensor<5xf32> + return %r0, %r1 : f32, f32 +} + +// ----- + +// CHECK-LABEL: func @scf_execute_region_memory_space +// CHECK: memref.alloc() {{.*}} : memref<5xf32, 1> +// CHECK: memref.store +// CHECK: memref.load +// CHECK: memref.dealloc +func.func @scf_execute_region_memory_space(%f: f32) -> f32 { + %c0 = arith.constant 0 : index + %0 = scf.execute_region -> tensor<5xf32> { + %1 = bufferization.alloc_tensor() {memory_space = 1 : ui64} : tensor<5xf32> + %2 = tensor.insert %f into %1[%c0] : tensor<5xf32> + scf.yield %2 : tensor<5xf32> + } + %r = tensor.extract %0[%c0] : tensor<5xf32> + return %r : f32 +} + +// ----- + +// Additional allocs are inserted in the loop body. We just check that all +// allocs have the correct memory space. + +// CHECK-LABEL: func @scf_for_swapping_yields_memory_space +func.func @scf_for_swapping_yields_memory_space( + %sz: index, %C : tensor<4xf32>, %lb : index, %ub : index, %step : index) + -> (f32, f32) +{ + // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref + // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref + %A = bufferization.alloc_tensor(%sz) {memory_space = 1 : ui64} : tensor + %B = bufferization.alloc_tensor(%sz) {memory_space = 1 : ui64} : tensor + + // CHECK: scf.for {{.*}} { + %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) + -> (tensor, tensor) + { + // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref + // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref + %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor + %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor + // Yield tensors in different order. 
+ scf.yield %ttB, %ttA : tensor, tensor + } + // CHECK: } + %f0 = tensor.extract %r0#0[%step] : tensor + %f1 = tensor.extract %r0#1[%step] : tensor + return %f0, %f1: f32, f32 +}