diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -486,6 +486,47 @@
   }
 };
 
+static void bufferizeGenerateLikeOpBody(RewriterBase &rewriter, Location loc,
+                                        Value buffer, ValueRange dynamicSizes,
+                                        Region &body) {
+  assert(body.hasOneBlock() && "expected body with single block");
+  auto memrefType = buffer.getType().cast<MemRefType>();
+  OpBuilder::InsertionGuard g(rewriter);
+
+  // Collect loop bounds.
+  int64_t rank = memrefType.getRank();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  SmallVector<Value> lowerBounds(rank, zero);
+  SmallVector<Value> steps(rank, one);
+  SmallVector<Value> upperBounds;
+  int nextDynamicIndex = 0;
+  for (int i = 0; i < rank; i++) {
+    Value upperBound = memrefType.isDynamicDim(i)
+                           ? dynamicSizes[nextDynamicIndex++]
+                           : rewriter.create<arith::ConstantIndexOp>(
+                                 loc, memrefType.getDimSize(i));
+    upperBounds.push_back(upperBound);
+  }
+
+  // Generate tensor elements with a parallel loop that stores into
+  // each element of the resulting memref. We use mergeBlockBefore to "move"
+  // this op's body into the scf.parallel's body.
+  auto parallel =
+      rewriter.create<scf::ParallelOp>(loc, lowerBounds, upperBounds, steps);
+  Block *parallelBody = parallel.getBody();
+  rewriter.mergeBlockBefore(&body.front(), parallelBody->getTerminator(),
+                            parallelBody->getArguments());
+  // Replace the inlined yield op with a store op. The scf.parallel's builder
+  // already populated an scf.yield at the end, so we don't need to worry
+  // about creating that.
+  Operation *elementYield = parallelBody->getTerminator()->getPrevNode();
+  rewriter.setInsertionPointAfter(elementYield);
+  rewriter.replaceOpWithNewOp<memref::StoreOp>(
+      elementYield, elementYield->getOperands()[0], buffer,
+      parallelBody->getArguments());
+}
+
 /// Bufferization of tensor.generate.
 struct GenerateOpInterface
     : public BufferizableOpInterface::ExternalModel<GenerateOpInterface,
                                                     tensor::GenerateOp> {
@@ ... @@
     Value buffer = rewriter.create<bufferization::ToMemrefOp>(
         op->getLoc(), memrefType, *tensorAlloc);
 
-    // Collect loop bounds.
-    int64_t rank = memrefType.getRank();
-    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-    Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    SmallVector<Value> lowerBounds(rank, zero);
-    SmallVector<Value> steps(rank, one);
-    SmallVector<Value> upperBounds;
-    int nextDynamicIndex = 0;
-    for (int i = 0; i < rank; i++) {
-      Value upperBound =
-          memrefType.isDynamicDim(i)
-              ? generateOp.getDynamicExtents()[nextDynamicIndex++]
-              : rewriter.create<arith::ConstantIndexOp>(
-                    loc, memrefType.getDimSize(i));
-      upperBounds.push_back(upperBound);
-    }
-
-    // Generate tensor elements with a parallel loop that stores into
-    // each element of the resulting memref. We use mergeBlockBefore to "move"
-    // this op's body into the scf.parallel's body.
-    auto parallel =
-        rewriter.create<scf::ParallelOp>(loc, lowerBounds, upperBounds, steps);
-    Block *parallelBody = parallel.getBody();
-    rewriter.mergeBlockBefore(&generateOp.getBody().front(),
-                              parallelBody->getTerminator(),
-                              parallelBody->getArguments());
-    // Replace the inlined yield op with a store op. The scf.parallel's builder
-    // already populated an scf.yield at the end, so we don't need to worry
-    // about creating that.
-    Operation *elementYield = parallelBody->getTerminator()->getPrevNode();
-    rewriter.setInsertionPointAfter(elementYield);
-    rewriter.replaceOpWithNewOp<memref::StoreOp>(
-        elementYield, elementYield->getOperands()[0], buffer,
-        parallelBody->getArguments());
-
+    bufferizeGenerateLikeOpBody(rewriter, loc, buffer,
+                                generateOp.getDynamicExtents(),
+                                generateOp.getBody());
+
     replaceOpWithBufferizedValues(rewriter, op, buffer);
     return success();
@@ -833,12 +842,31 @@
     return {};
   }
 
+  FailureOr<BaseMemRefType>
+  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
+                const DenseMap<Value, BaseMemRefType> &fixedTypes) const {
+    // Infer memory space from the source tensor.
+    auto padOp = cast<tensor::PadOp>(op);
+    auto maybeSrcBufferType =
+        bufferization::getBufferType(padOp.getSource(), options, fixedTypes);
+    if (failed(maybeSrcBufferType))
+      return failure();
+    MemRefLayoutAttrInterface layout;
+    return MemRefType::get(padOp.getResultType().getShape(),
+                           padOp.getResultType().getElementType(), layout,
+                           maybeSrcBufferType->getMemorySpace());
+  }
+
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           const BufferizationOptions &options) const {
     auto padOp = cast<tensor::PadOp>(op);
     Location loc = padOp.getLoc();
     RankedTensorType resultType = padOp.getResultType();
     RankedTensorType srcType = padOp.getSourceType();
+    auto resultBufferType =
+        bufferization::getBufferType(padOp.getResult(), options);
+    if (failed(resultBufferType))
+      return failure();
 
     auto toValue = [&](OpFoldResult ofr) {
       if (ofr.is<Value>())
@@ -866,25 +894,34 @@
       dynamicSizes.push_back(sum);
     }
 
-    // Create tensor::GenerateOp.
-    auto generateOp =
-        rewriter.create<tensor::GenerateOp>(loc, resultType, dynamicSizes);
-    // Move over "escape" attribute if present.
-    if (padOp->hasAttr(BufferizationDialect::kEscapeAttrName))
-      generateOp->setAttr(
-          BufferizationDialect::kEscapeAttrName,
-          padOp->getAttr(BufferizationDialect::kEscapeAttrName));
-    // TODO: Memory space
-    rewriter.inlineRegionBefore(padOp.getRegion(), generateOp.getBody(),
-                                generateOp.getBody().begin());
+    // Should the buffer be deallocated?
+    bool dealloc =
+        shouldDeallocateOpResult(padOp.getResult().cast<OpResult>(), options);
+    // Allocate a buffer for the padded result.
+    FailureOr<Value> tensorAlloc =
+        allocateTensorForShapedValue(rewriter, loc, padOp.getResult(),
+                                     /*escape=*/!dealloc, options,
+                                     /*copy=*/false);
+    if (failed(tensorAlloc))
+      return failure();
+    Value buffer = rewriter.create<bufferization::ToMemrefOp>(
+        op->getLoc(), *resultBufferType, *tensorAlloc);
+
+    // tensor::PadOp is like tensor::GenerateOp: The only difference is that
+    // only a part of the generated tensor is needed. For simplicity, we reuse
+    // the same functionality here.
+    bufferizeGenerateLikeOpBody(rewriter, loc, buffer, dynamicSizes,
+                                padOp.getBodyRegion());
 
     // Create tensor::InsertSliceOp.
+    Value filledBuffer =
+        rewriter.create<bufferization::ToTensorOp>(loc, buffer);
     SmallVector<OpFoldResult> sliceSizes =
         getMixedSizes(rewriter, loc, padOp.getSource());
     SmallVector<OpFoldResult> sliceStrides(srcType.getRank(),
                                            rewriter.getIndexAttr(1));
     rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>(
-        padOp, padOp.getSource(), generateOp.getResult(),
+        padOp, padOp.getSource(), filledBuffer,
         /*offsets=*/padOp.getMixedLowPad(), sliceSizes, sliceStrides);
 
     return success();
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -253,3 +253,29 @@
   %1 = tensor.insert_slice %0 into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
   return %1 : tensor<10xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @pad_memory_space(
+//  CHECK-SAME:     %[[t:.*]]: memref<?xf32, strided<[?], offset: ?>>
+func.func @pad_memory_space(%t: tensor<?xf32>, %h1: index, %f: f32, %pos: index) -> f32
+{
+  // CHECK: %[[alloc_tensor:.*]] = memref.alloc{{.*}} : memref<?xf32, 3>
+  // CHECK: memref.copy %[[t]], %[[alloc_tensor]]
+  %0 = bufferization.alloc_tensor() copy(%t)
+      {memory_space = 3 : ui64} : tensor<?xf32>
+  // CHECK: %[[padded_alloc:.*]] = memref.alloc() {{.*}} : memref<15xf32, 3>
+  // CHECK: scf.parallel
+  // CHECK:   memref.store {{.*}} : memref<15xf32, 3>
+  // CHECK: %[[subview:.*]] = memref.subview {{.*}} : memref<15xf32, 3> to memref<?xf32, strided<[1], offset: 2>, 3>
+  // CHECK: memref.copy %[[alloc_tensor]], %[[subview]]
+  %1 = tensor.pad %0 low[2] high[%h1] {
+  ^bb0(%arg0: index):
+    tensor.yield %f : f32
+  } : tensor<?xf32> to tensor<15xf32>
+  // CHECK: memref.load {{.*}} : memref<15xf32, 3>
+  %2 = tensor.extract %1[%pos] : tensor<15xf32>
+  // CHECK-DAG: memref.dealloc %[[alloc_tensor]]
+  // CHECK-DAG: memref.dealloc %[[padded_alloc]]
+  return %2 : f32
+}
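
Note for reviewers: to make the CHECK lines in the new test easier to follow, the IR produced for the tensor.pad above looks roughly like the hand-written sketch below. This is not actual pass output; the %c* constants and %dim (the memref.dim of the source buffer) are illustrative names.

  %padded_alloc = memref.alloc() : memref<15xf32, 3>
  // The pad body's tensor.yield %f becomes a memref.store into every
  // element of the full 15-element buffer.
  scf.parallel (%i) = (%c0) to (%c15) step (%c1) {
    memref.store %f, %padded_alloc[%i] : memref<15xf32, 3>
    scf.yield
  }
  // The source is then copied into a subview at the low-pad offset (2),
  // which is the bufferized form of the tensor.insert_slice.
  %subview = memref.subview %padded_alloc[2] [%dim] [1]
      : memref<15xf32, 3> to memref<?xf32, strided<[1], offset: 2>, 3>
  memref.copy %alloc_tensor, %subview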