This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Dialect/Bufferization/IR/
-
Dialect/
-
Bufferization/
-
IR/
-
BufferizableOpInterface.cpp
-
test/Dialect/Bufferization/Transforms/
-
Dialect/
-
Bufferization/
-
Transforms/
-
one-shot-bufferize.mlir

Differential D147790

[mlir] [bufferization] Fix dealloc errors.
AbandonedPublic

Authored by cxy-1993 on Apr 7 2023, 8:09 AM.

Download Raw Diff

Details

Reviewers

springerm
mehdi_amini
rriddle
nicolasvasilache

Summary

This patch fixes a problem where the bufferization pass generates a dealloc at an inappropriate location.

Let me know if you need an RFC for this patch. Thanks for your time.

In fact, this patch does not pass the tests. If the input IR is:

 mlir
#map = affine_map<(d0) -> (d0 * 5)>                                                
func.func @ternimator_use_not_deallocated(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
  %0 = scf.forall (%arg1, %arg2) in (2, 2) shared_outs(%arg3 = %arg0) -> (tensor<10x10xf32>) {
    %1 = bufferization.alloc_tensor() : tensor<5x5xf32>                            
    %2 = affine.apply #map(%arg1)                                                  
    %3 = affine.apply #map(%arg2)                                                  
    scf.forall.in_parallel {                                                       
      tensor.parallel_insert_slice %1 into %arg3[%2, %3] [5, 5] [1, 1] : tensor<5x5xf32> into tensor<10x10xf32>
    }                                                                              
  }                                                                                
  return %0 : tensor<10x10xf32>                                                    
}

and it is bufferized with the option:

-one-shot-bufferize="allow-unknown-ops copy-before-write"

then bufferization adds a copy before tensor.parallel_insert_slice, which causes a failure. Please kindly show me how to solve this problem, thanks.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

cxy-1993 created this revision.Apr 7 2023, 8:09 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 7 2023, 8:09 AM

Herald added subscribers: bviyer, Moerafaat, zero9178 and 21 others. · View Herald Transcript

cxy-1993 requested review of this revision.Apr 7 2023, 8:09 AM

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptApr 7 2023, 8:09 AM

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

Harbormaster completed remote builds in B224219: Diff 511696.Apr 7 2023, 8:25 AM

This is tricky... I think the ParallelInsertSliceOp should not bufferize to a memory write. Then, a copy would never be inserted there. I.e., ParallelInsertSliceOpInterface::bufferizesToMemoryWrite should always return false. Can you give that a try and see if it fails any tests? I haven't fully thought this through yet and may be missing something...

Can you try this patch: https://reviews.llvm.org/D148408. I ran into a similar issue today and I think this should fix it.

In D147790#4270628, @springerm wrote:

Can you try this patch: https://reviews.llvm.org/D148408. I ran into a similar issue today and I think this should fix it.

Thanks for your reply, and sorry for not responding sooner.
This patch has solved my problem, thanks.

Following https://reviews.llvm.org/D148408: for any op contained in a terminator's region (e.g., tensor.parallel_insert_slice), should its users be moved before the terminator after bufferization?

cxy-1993 abandoned this revision.Apr 15 2023, 7:44 PM

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Bufferization/

IR/

BufferizableOpInterface.cpp

20 lines

test/

Dialect/

Bufferization/

Transforms/

one-shot-bufferize.mlir

23 lines

Diff 511696

mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp

Show First 20 Lines • Show All 273 Lines • ▼ Show 20 Lines

bool bufferization::shouldDeallocateOpResult(		bool bufferization::shouldDeallocateOpResult(
OpResult opResult, const BufferizationOptions &options) {		OpResult opResult, const BufferizationOptions &options) {
Operation *op = opResult.getOwner();		Operation *op = opResult.getOwner();
assert(options.dynCastBufferizableOp(op).bufferizesToAllocation(opResult) &&		assert(options.dynCastBufferizableOp(op).bufferizesToAllocation(opResult) &&
"expected that op allocates");		"expected that op allocates");

AnalysisState analysisState(options);		AnalysisState analysisState(options);
		if (analysisState.isTensorYielded(opResult))
		return false;

if (op->hasAttr(BufferizationDialect::kEscapeAttrName)) {		if (op->hasAttr(BufferizationDialect::kEscapeAttrName)) {
// AllocTensorOp has one result.		// AllocTensorOp has one result.
ArrayAttr escapeAttr =		ArrayAttr escapeAttr =
op->getAttr(BufferizationDialect::kEscapeAttrName).cast<ArrayAttr>();		op->getAttr(BufferizationDialect::kEscapeAttrName).cast<ArrayAttr>();
return !escapeAttr[0].cast<BoolAttr>().getValue();		return !escapeAttr[0].cast<BoolAttr>().getValue();
}		}

// No "escape" annotation found.		// No "escape" annotation found.
if (options.createDeallocs) {		return options.createDeallocs;
// Perform an ad-hoc analysis.
return !analysisState.isTensorYielded(opResult);
}

return false;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// OpFilter		// OpFilter
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

bool OpFilter::isOpAllowed(Operation *op) const {		bool OpFilter::isOpAllowed(Operation *op) const {
// All other ops: Allow/disallow according to filter.		// All other ops: Allow/disallow according to filter.
▲ Show 20 Lines • Show All 302 Lines • ▼ Show 20 Lines	while (!worklist.empty()) {
if (!options.dynCastBufferizableOp(op))		if (!options.dynCastBufferizableOp(op))
continue;		continue;

// We cannot analyze through ToMemrefOps, so we have to conservatively		// We cannot analyze through ToMemrefOps, so we have to conservatively
// assume that the value is yielded.		// assume that the value is yielded.
if (isa<ToMemrefOp>(op))		if (isa<ToMemrefOp>(op))
return true;		return true;

// Check if the op is returning/yielding.		// Check if the op is recursively returning/yielding.
if (isRegionReturnLike(op))		Operation *parentOp = op;
		do {
		if (isRegionReturnLike(parentOp) ||
		parentOp->hasTrait<OpTrait::IsTerminator>())
return true;		return true;
		} while ((parentOp = parentOp->getParentOp()));

// Add all aliasing OpResults to the worklist.		// Add all aliasing OpResults to the worklist.
// Note: In the absence of detailed analysis information (e.g., there may be		// Note: In the absence of detailed analysis information (e.g., there may be
// no function call analysis information), this `getAliasingOpResult` is		// no function call analysis information), this `getAliasingOpResult` is
// conservative and may report additional OpResults as potentially aliasing.		// conservative and may report additional OpResults as potentially aliasing.
for (AliasingOpResult alias : getAliasingOpResults(*operand))		for (AliasingOpResult alias : getAliasingOpResults(*operand))
for (OpOperand &use : alias.opResult.getUses())		for (OpOperand &use : alias.opResult.getUses())
worklist.push_back(&use);		worklist.push_back(&use);
▲ Show 20 Lines • Show All 365 Lines • Show Last 20 Lines

mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir

Show First 20 Lines • Show All 193 Lines • ▼ Show 20 Lines	func.func @read_of_alias(%t: tensor<100xf32>, %pos1: index, %pos2: index,
// CHECK-TOP-DOWN-ANALYSIS: memref.copy		// CHECK-TOP-DOWN-ANALYSIS: memref.copy
// CHECK-TOP-DOWN-ANALYSIS: memref.store %{{.*}}, %[[alloc]]		// CHECK-TOP-DOWN-ANALYSIS: memref.store %{{.*}}, %[[alloc]]
%0 = tensor.insert %f into %t[%pos1] : tensor<100xf32>		%0 = tensor.insert %f into %t[%pos1] : tensor<100xf32>
%1 = tensor.extract_slice %t[%pos2][%sz][1] : tensor<100xf32> to tensor<?xf32>		%1 = tensor.extract_slice %t[%pos2][%sz][1] : tensor<100xf32> to tensor<?xf32>
%2 = tensor.extract %1[%pos3] : tensor<?xf32>		%2 = tensor.extract %1[%pos3] : tensor<?xf32>
%3 = tensor.extract %0[%pos3] : tensor<100xf32>		%3 = tensor.extract %0[%pos3] : tensor<100xf32>
return %2, %3 : f32, f32		return %2, %3 : f32, f32
}		}

		// -----

		// CHECK-LABEL: func @ternimator_use_not_deallocated
		#map = affine_map<(d0) -> (d0 * 5)>
		func.func @ternimator_use_not_deallocated(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
		// CHECK: %[[alloc:.*]] = memref.alloc
		// CHECK: memref.copy {{.*}} %[[alloc]]
		// CHECK: scf.forall ({{.*}}) in (2, 2) {
		%0 = scf.forall (%arg1, %arg2) in (2, 2) shared_outs(%arg3 = %arg0) -> (tensor<10x10xf32>) {
		// CHECK: %[[local_alloc:.*]] = memref.alloc
		// CHECK-NOT: memref.dealloc
		// CHECK: %[[subview:.*]] = memref.subview %[[alloc]]
		// CHECK: memref.copy %[[local_alloc]], %[[subview]]
		%1 = bufferization.alloc_tensor() : tensor<5x5xf32>
		%2 = affine.apply #map(%arg1)
		%3 = affine.apply #map(%arg2)
		scf.forall.in_parallel {
		tensor.parallel_insert_slice %1 into %arg3[%2, %3] [5, 5] [1, 1] : tensor<5x5xf32> into tensor<10x10xf32>
		}
		}
		return %0 : tensor<10x10xf32>
		}