diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -721,16 +721,34 @@ ValueRange lhsOperands = lhs->getOperands(), rhsOperands = rhs->getOperands(); SmallVector<Value> lhsOperandStorage, rhsOperandStorage; if (lhs->hasTrait<mlir::OpTrait::IsCommutative>()) { - lhsOperandStorage.append(lhsOperands.begin(), lhsOperands.end()); - llvm::sort(lhsOperandStorage, [](Value a, Value b) -> bool { - return a.getAsOpaquePointer() < b.getAsOpaquePointer(); - }); - lhsOperands = lhsOperandStorage; + auto sortValues = [](ValueRange values) { + SmallVector<Value> sortedValues = llvm::to_vector(values); + llvm::sort(sortedValues, [](Value a, Value b) { + auto aArg = a.dyn_cast<BlockArgument>(); + auto bArg = b.dyn_cast<BlockArgument>(); + + // Case 1. Both `a` and `b` are `BlockArgument`s. + if (aArg && bArg) { + if (aArg.getParentBlock() == bArg.getParentBlock()) + return aArg.getArgNumber() < bArg.getArgNumber(); + return aArg.getParentBlock() < bArg.getParentBlock(); + } - rhsOperandStorage.append(rhsOperands.begin(), rhsOperands.end()); - llvm::sort(rhsOperandStorage, [](Value a, Value b) -> bool { - return a.getAsOpaquePointer() < b.getAsOpaquePointer(); - }); + // Case 2. One of them is a `BlockArgument` and other is not. Treat + // `BlockArgument` as lesser. + if (aArg && !bArg) + return true; + if (bArg && !aArg) + return false; + + // Case 3. Both are values. 
+ return a.getAsOpaquePointer() < b.getAsOpaquePointer(); + }); + return sortedValues; + }; + lhsOperandStorage = sortValues(lhsOperands); + lhsOperands = lhsOperandStorage; + rhsOperandStorage = sortValues(rhsOperands); rhsOperands = rhsOperandStorage; } auto checkValueRangeMapping = diff --git a/mlir/lib/Transforms/CSE.cpp b/mlir/lib/Transforms/CSE.cpp --- a/mlir/lib/Transforms/CSE.cpp +++ b/mlir/lib/Transforms/CSE.cpp @@ -47,11 +47,70 @@ if (lhs == getTombstoneKey() || lhs == getEmptyKey() || rhs == getTombstoneKey() || rhs == getEmptyKey()) return false; + + // If op has no regions, operation equivalence w.r.t operands alone is + // enough. + if (lhs->getNumRegions() == 0 && rhs->getNumRegions() == 0) { + return OperationEquivalence::isEquivalentTo( + const_cast<Operation *>(lhsC), const_cast<Operation *>(rhsC), + OperationEquivalence::exactValueMatch, + OperationEquivalence::ignoreValueEquivalence, + OperationEquivalence::IgnoreLocations); + } + + // If lhs or rhs does not have a single region with a single block, they + // aren't CSEed for now. + if (lhs->getNumRegions() != 1 || rhs->getNumRegions() != 1 || + !llvm::hasSingleElement(lhs->getRegion(0)) || + !llvm::hasSingleElement(rhs->getRegion(0))) + return false; + + // Compare the two blocks. + Block &lhsBlock = lhs->getRegion(0).front(); + Block &rhsBlock = rhs->getRegion(0).front(); + + // Don't CSE if number of arguments differ. + if (lhsBlock.getNumArguments() != rhsBlock.getNumArguments()) + return false; + + // Map to store `Value`s from `lhsBlock` that are equivalent to `Value`s in + // `rhsBlock`. `Value`s from `lhsBlock` are the key. + DenseMap<Value, Value> areEquivalentValues; + for (auto bbArgs : llvm::zip(lhs->getRegion(0).getArguments(), + rhs->getRegion(0).getArguments())) { + areEquivalentValues[std::get<0>(bbArgs)] = std::get<1>(bbArgs); + } + + // Helper function to get the parent operation. 
+ auto getParent = [](Value v) -> Operation * { + if (auto blockArg = v.dyn_cast<BlockArgument>()) + return blockArg.getParentBlock()->getParentOp(); + return v.getDefiningOp()->getParentOp(); + }; + + // Callback to compare if operands of ops in the region of `lhs` and `rhs` + // are equivalent. + auto mapOperands = [&](Value lhsValue, Value rhsValue) -> LogicalResult { + if (lhsValue == rhsValue) + return success(); + if (areEquivalentValues.lookup(lhsValue) == rhsValue) + return success(); + return failure(); + }; + + // Callback to compare if results of ops in the region of `lhs` and `rhs` + // are equivalent. + auto mapResults = [&](Value lhsResult, Value rhsResult) -> LogicalResult { + if (getParent(lhsResult) == lhs && getParent(rhsResult) == rhs) { + auto insertion = areEquivalentValues.insert({lhsResult, rhsResult}); + return success(insertion.first->second == rhsResult); + } + return success(); + }; + return OperationEquivalence::isEquivalentTo( const_cast<Operation *>(lhsC), const_cast<Operation *>(rhsC), - /*mapOperands=*/OperationEquivalence::exactValueMatch, - /*mapResults=*/OperationEquivalence::ignoreValueEquivalence, - OperationEquivalence::IgnoreLocations); + mapOperands, mapResults, OperationEquivalence::IgnoreLocations); } }; } // namespace @@ -204,7 +263,8 @@ // Don't simplify operations with nested blocks. We don't currently model // equality comparisons correctly among other things. It is also unclear // whether we would want to CSE such operations. 
- if (op->getNumRegions() != 0) + if (!(op->getNumRegions() == 0 || + (op->getNumRegions() == 1 && llvm::hasSingleElement(op->getRegion(0))))) return failure(); // Some simple use case of operation with memory side-effect are dealt with diff --git a/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir b/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir --- a/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir @@ -17,7 +17,6 @@ // CHECK: linalg.fill ins(%[[C0]] : index) outs(%[[T4]] : memref<16xindex>) // CHECK: %[[T6:.*]] = memref.alloc() : memref<16xf64> // CHECK: %[[T7:.*]] = memref.cast %[[T6]] : memref<16xf64> to memref<?xf64> -// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[T6]] : memref<16xf64>) // CHECK: linalg.fill ins(%[[C0]] : index) outs(%[[T1]] : memref<3xindex>) // CHECK: memref.store %[[A]], %[[T0]][%[[C0]]] : memref<1xindex> // CHECK: %[[P0:.*]] = sparse_tensor.push_back %[[T1]], %[[T3]] diff --git a/mlir/test/Transforms/cse.mlir b/mlir/test/Transforms/cse.mlir --- a/mlir/test/Transforms/cse.mlir +++ b/mlir/test/Transforms/cse.mlir @@ -322,3 +322,127 @@ %3 = arith.muli %1, %2 : i32 return %3 : i32 } + +// Check that an operation with a single region can CSE. 
+func.func @cse_single_block_ops(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32): + test.region_yield %arg0 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32): + test.region_yield %arg0 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32> +} +// CHECK-LABEL: func @cse_single_block_ops +// CHECK: %[[OP:.+]] = test.cse_of_single_block_op +// CHECK-NOT: test.cse_of_single_block_op +// CHECK: return %[[OP]], %[[OP]] + +// Operations with different number of bbArgs don't CSE. +func.func @no_cse_varied_bbargs(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + test.region_yield %arg0 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32): + test.region_yield %arg0 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32> +} +// CHECK-LABEL: func @no_cse_varied_bbargs +// CHECK: %[[OP0:.+]] = test.cse_of_single_block_op +// CHECK: %[[OP1:.+]] = test.cse_of_single_block_op +// CHECK: return %[[OP0]], %[[OP1]] + +// Operations with different regions don't CSE. +func.func @no_cse_region_difference_simple(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + test.region_yield %arg0 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + test.region_yield %arg1 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, 
tensor<?x?xf32> +} +// CHECK-LABEL: func @no_cse_region_difference_simple +// CHECK: %[[OP0:.+]] = test.cse_of_single_block_op +// CHECK: %[[OP1:.+]] = test.cse_of_single_block_op +// CHECK: return %[[OP0]], %[[OP1]] + +// Operations with identical regions containing multiple statements CSE. +func.func @cse_single_block_ops_identical_bodies(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %c : f32, %d : i1) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.divf %arg0, %arg1 : f32 + %2 = arith.remf %arg0, %c : f32 + %3 = arith.select %d, %1, %2 : f32 + test.region_yield %3 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.divf %arg0, %arg1 : f32 + %2 = arith.remf %arg0, %c : f32 + %3 = arith.select %d, %1, %2 : f32 + test.region_yield %3 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32> +} +// CHECK-LABEL: func @cse_single_block_ops_identical_bodies +// CHECK: %[[OP:.+]] = test.cse_of_single_block_op +// CHECK-NOT: test.cse_of_single_block_op +// CHECK: return %[[OP]], %[[OP]] + +// Operations with non-identical regions don't CSE. 
+func.func @no_cse_single_block_ops_different_bodies(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %c : f32, %d : i1) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.divf %arg0, %arg1 : f32 + %2 = arith.remf %arg0, %c : f32 + %3 = arith.select %d, %1, %2 : f32 + test.region_yield %3 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.divf %arg0, %arg1 : f32 + %2 = arith.remf %arg0, %c : f32 + %3 = arith.select %d, %2, %1 : f32 + test.region_yield %3 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32> +} +// CHECK-LABEL: func @no_cse_single_block_ops_different_bodies +// CHECK: %[[OP0:.+]] = test.cse_of_single_block_op +// CHECK: %[[OP1:.+]] = test.cse_of_single_block_op +// CHECK: return %[[OP0]], %[[OP1]] + +// Account for commutative ops within regions during CSE. 
+func.func @cse_single_block_with_commutative_ops(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %c : f32) + -> (tensor<?x?xf32>, tensor<?x?xf32>) { + %0 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.addf %arg0, %arg1 : f32 + %2 = arith.mulf %1, %c : f32 + test.region_yield %2 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + %1 = test.cse_of_single_block_op inputs(%a, %b) { + ^bb0(%arg0 : f32, %arg1 : f32): + %1 = arith.addf %arg1, %arg0 : f32 + %2 = arith.mulf %c, %1 : f32 + test.region_yield %2 : f32 + } : tensor<?x?xf32>, tensor<?x?xf32> -> tensor<?x?xf32> + return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32> +} +// CHECK-LABEL: func @cse_single_block_with_commutative_ops +// CHECK: %[[OP:.+]] = test.cse_of_single_block_op +// CHECK-NOT: test.cse_of_single_block_op +// CHECK: return %[[OP]], %[[OP]] diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -670,8 +670,8 @@ // Produces an error value on the error path def TestInternalBranchOp : TEST_Op<"internal_br", - [DeclareOpInterfaceMethods<BranchOpInterface>, Terminator, - AttrSizedOperandSegments]> { + [DeclareOpInterfaceMethods<BranchOpInterface>, Terminator, + AttrSizedOperandSegments]> { let arguments = (ins Variadic<AnyType>:$successOperands, Variadic<AnyType>:$errorOperands); @@ -3045,4 +3045,19 @@ let regions = (region SizedRegion<1>:$body); } +//===---------------------------------------------------------------------===// +// Test CSE +//===---------------------------------------------------------------------===// + +def TestCSEOfSingleBlockOp : TEST_Op<"cse_of_single_block_op", + [SingleBlockImplicitTerminator<"RegionYieldOp">, Pure]> { + let arguments = (ins Variadic<AnyType>:$inputs); + let results = (outs Variadic<AnyType>:$outputs); + let regions = (region SizedRegion<1>:$region); + let assemblyFormat = [{ + 
attr-dict `inputs` `(` $inputs `)` + $region `:` type($inputs) `->` type($outputs) + }]; +} + #endif // TEST_OPS