diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -42,9 +42,12 @@
 
 /// Creates a pass that promotes heap-based allocations to stack-based ones.
 /// Only buffers smaller than the provided size are promoted.
+/// Dynamically shaped buffers are promoted if their rank is below the
+/// given limit.
 std::unique_ptr<Pass>
 createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes = 1024,
-                                unsigned bitwidthOfIndexType = 64);
+                                unsigned bitwidthOfIndexType = 64,
+                                unsigned maxRankOfTensors = 2);
 
 /// Creates a pass that converts memref function results to out-params.
 std::unique_ptr<Pass> createBufferResultsToOutParamsPass();
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -213,11 +213,13 @@
   let constructor = "mlir::createPromoteBuffersToStackPass()";
   let options = [
     Option<"maxAllocSizeInBytes", "max-alloc-size-in-bytes", "unsigned",
            /*default=*/"1024",
-           "Define the maximum size in bytes to promote allocations to stack.">,
+           "Maximal size in bytes to promote allocations to stack.">,
     Option<"bitwidthOfIndexType", "bitwidth-of-index-type", "unsigned",
            /*default=*/"64",
-           "Define the bitwidth of the index type. Used for size estimation.">,
-
+           "Bitwidth of the index type. Used for size estimation.">,
+    Option<"maxRankOfTensors", "max-rank-of-tensors", "unsigned",
+           /*default=*/"2",
+           "Promote dynamically shaped buffers only below this rank.">,
   ];
 }
diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp
--- a/mlir/lib/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Transforms/BufferOptimizations.cpp
@@ -30,10 +30,23 @@
 /// transformation is only applied to small buffers since large buffers could
 /// exceed the stack space.
 static bool isSmallAlloc(Value alloc, unsigned maximumSizeInBytes,
-                         unsigned bitwidthOfIndexType) {
+                         unsigned bitwidthOfIndexType,
+                         unsigned maxRankOfTensors) {
   auto type = alloc.getType().dyn_cast<ShapedType>();
-  if (!type || !type.hasStaticShape())
+  if (!type)
     return false;
+  if (!type.hasStaticShape()) {
+    // Check whether the rank of the buffer is below the allowed maximum.
+    if (type.getRank() < maxRankOfTensors) {
+      // Promote only if all operands of the alloc are produced by RankOps.
+      return llvm::all_of(alloc.getDefiningOp()->getOperands(),
+                          [&](Value operand) {
+                            Operation *operandOp = operand.getDefiningOp();
+                            return operandOp && llvm::isa<RankOp>(operandOp);
+                          });
+    }
+    return false;
+  }
   // For index types, use the provided size, as the type does not know.
   unsigned int bitwidth = type.getElementType().isIndex()
                               ? bitwidthOfIndexType
@@ -286,7 +299,8 @@
     : BufferPlacementTransformationBase(op) {}
 
   /// Promote buffers to stack-based allocations.
-  void promote(unsigned maximumSize, unsigned bitwidthOfIndexType) {
+  void promote(unsigned maximumSize, unsigned bitwidthOfIndexType,
+               unsigned maxRankOfTensors) {
     for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
       Value alloc = std::get<0>(entry);
       Operation *dealloc = std::get<1>(entry);
@@ -294,8 +308,9 @@
       // The transformation is done if the allocation is limited to a given
      // size. Furthermore, a deallocation must not be defined for this
       // allocation entry and a parent allocation scope must exist.
-      if (!isSmallAlloc(alloc, maximumSize, bitwidthOfIndexType) || dealloc ||
-          !hasAllocationScope(alloc, aliases))
+      if (!isSmallAlloc(alloc, maximumSize, bitwidthOfIndexType,
+                        maxRankOfTensors) ||
+          dealloc || !hasAllocationScope(alloc, aliases))
         continue;
 
       Operation *startOperation = BufferPlacementAllocs::getStartOperation(
@@ -303,12 +318,13 @@
           alloc, alloc.getParentBlock(), liveness);
       // Build a new alloca that is associated with its parent
       // `AutomaticAllocationScope` determined during the initialization phase.
       OpBuilder builder(startOperation);
-      auto alloca = builder.create<AllocaOp>(
-          alloc.getLoc(), alloc.getType().cast<MemRefType>());
+      Operation *allocOp = alloc.getDefiningOp();
+      Operation *alloca = builder.create<AllocaOp>(
+          alloc.getLoc(), alloc.getType().cast<MemRefType>(),
+          allocOp->getOperands());
       // Replace the original alloc by a newly created alloca.
-      Operation *allocOp = alloc.getDefiningOp();
-      allocOp->replaceAllUsesWith(alloca.getOperation());
+      allocOp->replaceAllUsesWith(alloca);
       allocOp->erase();
     }
   }
@@ -347,15 +363,18 @@
 struct PromoteBuffersToStackPass
     : PromoteBuffersToStackBase<PromoteBuffersToStackPass> {
 
   PromoteBuffersToStackPass(unsigned maxAllocSizeInBytes,
-                            unsigned bitwidthOfIndexType) {
+                            unsigned bitwidthOfIndexType,
+                            unsigned maxRankOfTensors) {
     this->maxAllocSizeInBytes = maxAllocSizeInBytes;
     this->bitwidthOfIndexType = bitwidthOfIndexType;
+    this->maxRankOfTensors = maxRankOfTensors;
   }
 
   void runOnFunction() override {
     // Move all allocation nodes and convert candidates into allocas.
     BufferPlacementPromotion optimizer(getFunction());
-    optimizer.promote(this->maxAllocSizeInBytes, this->bitwidthOfIndexType);
+    optimizer.promote(this->maxAllocSizeInBytes, this->bitwidthOfIndexType,
+                      this->maxRankOfTensors);
   }
 };
@@ -371,7 +390,8 @@
 
 std::unique_ptr<Pass>
 mlir::createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes,
-                                      unsigned bitwidthOfIndexType) {
-  return std::make_unique<PromoteBuffersToStackPass>(maxAllocSizeInBytes,
-                                                     bitwidthOfIndexType);
+                                      unsigned bitwidthOfIndexType,
+                                      unsigned maxRankOfTensors) {
+  return std::make_unique<PromoteBuffersToStackPass>(
+      maxAllocSizeInBytes, bitwidthOfIndexType, maxRankOfTensors);
 }
diff --git a/mlir/test/Transforms/promote-buffers-to-stack.mlir b/mlir/test/Transforms/promote-buffers-to-stack.mlir
--- a/mlir/test/Transforms/promote-buffers-to-stack.mlir
+++ b/mlir/test/Transforms/promote-buffers-to-stack.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s --check-prefix=CHECK --check-prefix DEFINDEX
 // RUN: mlir-opt -promote-buffers-to-stack="bitwidth-of-index-type=256 max-alloc-size-in-bytes=128" -split-input-file %s | FileCheck %s --check-prefix=CHECK --check-prefix BIGINDEX
 // RUN: mlir-opt -promote-buffers-to-stack="bitwidth-of-index-type=256 max-alloc-size-in-bytes=64" -split-input-file %s | FileCheck %s --check-prefix=CHECK --check-prefix LOWLIMIT
+// RUN: mlir-opt -promote-buffers-to-stack="max-rank-of-tensors=3" -split-input-file %s | FileCheck %s --check-prefix=CHECK --check-prefix RANK
 
 // This file checks the behavior of PromoteBuffersToStack pass for converting
 // AllocOps into AllocaOps, if possible.
@@ -14,8 +15,6 @@
 // PromoteBuffersToStack expected behavior: It should convert %0 into an
 // AllocaOp.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @condBranch
 func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
@@ -47,8 +46,6 @@
 // PromoteBuffersToStack expected behavior:
 // Since the alloc has dynamic type, it is not converted into an alloca.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @condBranchDynamicType
 func @condBranchDynamicType(
   %arg0: i1,
@@ -79,6 +76,90 @@
 
 // -----
 
+// Test Case: Alloc of dynamic shape whose size operand is a RankOp.
+// PromoteBuffersToStack expected behavior: The rank of the buffer is below
+// the default limit, so the alloc is converted into an alloca.
+
+// CHECK-LABEL: func @condBranchDynamicRanked
+func @condBranchDynamicRanked(
+  %arg0: i1,
+  %tensor: tensor<*xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3
+^bb2:
+  %0 = rank %tensor : tensor<*xf32>
+  %1 = alloc(%0) : memref<?xf32>
+  br ^bb3
+^bb3:
+  return
+}
+
+// CHECK-NEXT: cond_br
+// CHECK: ^bb2
+// CHECK: ^bb2
+// CHECK-NEXT: %[[RANK:.*]] = rank
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca(%[[RANK]])
+// CHECK: br ^bb3
+
+// -----
+
+// Test Case: Rank-2 alloc of dynamic shape whose size operands are RankOps.
+// PromoteBuffersToStack expected behavior: The alloc is only converted once
+// the rank limit is raised above two (see the RANK RUN line).
+
+// CHECK-LABEL: func @condBranchDynamicRanked2D
+func @condBranchDynamicRanked2D(
+  %arg0: i1,
+  %tensor: tensor<*xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3
+^bb2:
+  %0 = rank %tensor : tensor<*xf32>
+  %1 = alloc(%0, %0) : memref<?x?xf32>
+  br ^bb3
+^bb3:
+  return
+}
+
+// CHECK-NEXT: cond_br
+// CHECK: ^bb2
+// CHECK: ^bb2
+// CHECK-NEXT: %[[RANK:.*]] = rank
+// RANK-NEXT: %[[ALLOC:.*]] = alloca(%[[RANK]], %[[RANK]])
+// DEFINDEX-NEXT: %[[ALLOC:.*]] = alloc(%[[RANK]], %[[RANK]])
+// CHECK: br ^bb3
+
+// -----
+
+// Test Case: Alloc of dynamic shape whose size operand is a plain index
+// argument rather than a RankOp.
+// PromoteBuffersToStack expected behavior: The alloc is not converted.
+
+// CHECK-LABEL: func @condBranchDynamicNoRank
+func @condBranchDynamicNoRank(
+  %arg0: i1,
+  %arg1: index,
+  %tensor: tensor<*xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3
+^bb2:
+  %0 = alloc(%arg1) : memref<?xf32>
+  br ^bb3
+^bb3:
+  return
+}
+
+// CHECK-NEXT: cond_br
+// CHECK: ^bb2
+// CHECK: ^bb2
+// CHECK-NEXT: %[[ALLOC:.*]] = alloc
+// CHECK: br ^bb3
+
+// -----
+
 // Test Case: Existing AllocOp with no users.
 // PromoteBuffersToStack expected behavior: It should convert it to an
 // AllocaOp.
@@ -102,8 +183,6 @@
 // PromoteBuffersToStack expected behavior: It should convert it into an
 // AllocaOp.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @criticalEdge
 func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
@@ -132,8 +211,6 @@
 // bb2
 // PromoteBuffersToStack expected behavior: It converts the alloc in an alloca.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @invCriticalEdge
 func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -161,8 +238,6 @@
 // bb3 <- Initial position of the second AllocOp
 // PromoteBuffersToStack expected behavior: It converts the allocs into allocas.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @ifElse
 func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -198,8 +273,6 @@
 // bb3
 // PromoteBuffersToStack expected behavior: It converts the alloc into alloca.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @ifElseNoUsers
 func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -233,8 +306,6 @@
 // PromoteBuffersToStack expected behavior: The two allocs should be converted
 // into allocas.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @ifElseNested
 func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -270,8 +341,6 @@
 // PromoteBuffersToStack expected behavior: It converts the two AllocOps into
 // allocas.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @redundantOperations
 func @redundantOperations(%arg0: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -299,8 +368,6 @@
 // PromoteBuffersToStack expected behavior: Both AllocOps are converted into
 // allocas.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc
 func @moving_alloc_and_inserting_missing_dealloc(
   %cond: i1,
@@ -335,8 +402,6 @@
 // PromoteBuffersToStack expected behavior: The AllocOps are converted into
 // allocas.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @nested_regions_and_cond_branch
 func @nested_regions_and_cond_branch(
   %arg0: i1,
@@ -373,8 +438,6 @@
 // there is no conversion allowed. The second alloc is converted, since it
 // only remains in the scope of the function.
 
-#map0 = affine_map<(d0) -> (d0)>
-
 // CHECK-LABEL: func @memref_in_function_results
 func @memref_in_function_results(
   %arg0: memref<5xf32>,
@@ -583,4 +646,5 @@
 // DEFINDEX-NEXT: alloca()
 // BIGINDEX-NEXT: alloca()
 // LOWLIMIT-NEXT: alloc()
+// RANK-NEXT: alloca()
 // CHECK-NEXT: return
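
For reviewers who want to exercise the new option outside of mlir-opt, below is a minimal C++ sketch. It assumes only the createPromoteBuffersToStackPass overload declared in this patch; the surrounding pipeline wiring (PassManager, addNestedPass, header paths) is illustrative of MLIR at the time of this change, and buildPromotionPipeline is a hypothetical helper, not part of the patch:

    #include "mlir/IR/Function.h"
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Transforms/Passes.h"

    // Adds the promotion pass with a raised rank limit, mirroring the fourth
    // RUN line of the test file (max-rank-of-tensors=3).
    static void buildPromotionPipeline(mlir::PassManager &pm) {
      // PromoteBuffersToStack is a function pass, so nest it under FuncOp.
      pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass(
          /*maxAllocSizeInBytes=*/1024,
          /*bitwidthOfIndexType=*/64,
          // Ranks 0-2 qualify: a dynamically shaped alloc is promoted only if
          // its rank is below this value and its sizes come from rank ops.
          /*maxRankOfTensors=*/3));
    }

With these settings, the rank-2 alloc in @condBranchDynamicRanked2D above becomes an alloca, while the default limit of 2 leaves it as a heap allocation.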