diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -25,6 +25,7 @@ #include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/tools.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" using DeclareTargetCapturePair = @@ -3016,17 +3017,6 @@ if (rightHandClauseList) genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint, memoryOrder); - auto atomicUpdateOp = firOpBuilder.create( - currentLocation, lhsAddr, hint, memoryOrder); - - //// Generate body of Atomic Update operation - // If an argument for the region is provided then create the block with that - // argument. Also update the symbol's address with the argument mlir value. - llvm::SmallVector varTys = {varType}; - llvm::SmallVector locs = {currentLocation}; - firOpBuilder.createBlock(&atomicUpdateOp.getRegion(), {}, varTys, locs); - mlir::Value val = - fir::getBase(atomicUpdateOp.getRegion().front().getArgument(0)); const auto *varDesignator = std::get_if>( &assignmentStmtVariable.u); @@ -3039,21 +3029,94 @@ "Array references as atomic update variable"); assert(name && name->symbol && "No symbol attached to atomic update variable"); - converter.bindSymbol(*name->symbol, val); - // Set the insert for the terminator operation to go at the end of the - // block. - mlir::Block &block = atomicUpdateOp.getRegion().back(); + if (Fortran::semantics::IsAllocatableOrPointer(name->symbol->GetUltimate())) + converter.bindSymbol(*name->symbol, lhsAddr); + + // Temporarily lower the atomic update into an ExecuteRegion Op + auto tempOp = + firOpBuilder.create(currentLocation, varType); + firOpBuilder.createBlock(&tempOp.getRegion()); + mlir::Block &block = tempOp.getRegion().back(); firOpBuilder.setInsertionPointToEnd(&block); - Fortran::lower::StatementContext stmtCtx; mlir::Value rhsExpr = fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(assignmentStmtExpr), stmtCtx)); mlir::Value convertResult = firOpBuilder.createConvert(currentLocation, varType, rhsExpr); // Insert the terminator: YieldOp. - firOpBuilder.create(currentLocation, convertResult); - // Reset the insert point to before the terminator. + firOpBuilder.create(currentLocation, convertResult); firOpBuilder.setInsertionPointToStart(&block); + + // Hoist all the operations that are not transitively dependent on the + // variable that is going to be updated. + mlir::Value updateVar = converter.getSymbolAddress(*name->symbol); + if (auto decl = updateVar.getDefiningOp()) + updateVar = decl.getBase(); + llvm::SmallVector dependentOperands; + for (mlir::OpOperand &use : updateVar.getUses()) + dependentOperands.push_back(&use); + + llvm::SmallVector dependentOps; + while (!dependentOperands.empty()) { + mlir::OpOperand *use = dependentOperands.pop_back_val(); + mlir::Operation *op = use->getOwner(); + dependentOps.push_back(op); + for (mlir::OpResult result : op->getResults()) { + for (mlir::OpOperand &u : result.getUses()) { + dependentOperands.push_back(&u); + } + } + } + + llvm::SmallVector opsToMove; + for (mlir::Operation &op : tempOp.getRegion().getOps()) + if (llvm::find(dependentOps, &op) == dependentOps.end()) + opsToMove.push_back(&op); + + mlir::Operation *hoistPointOp = + tempOp->getParentOfType(); + if (!hoistPointOp) + hoistPointOp = tempOp.getOperation(); + for (mlir::Operation *op : opsToMove) + op->moveBefore(hoistPointOp); + + // Now create the AtomicUpdateOp using the Operations in the temporary + // SCF Execute Region Op. + firOpBuilder.setInsertionPointAfter(tempOp); + auto atomicUpdateOp = firOpBuilder.create( + currentLocation, updateVar, hint, memoryOrder); + + llvm::SmallVector varTys = {varType}; + llvm::SmallVector locs = {currentLocation}; + firOpBuilder.createBlock(&atomicUpdateOp.getRegion(), {}, varTys, locs); + mlir::Value val = + fir::getBase(atomicUpdateOp.getRegion().front().getArgument(0)); + + llvm::SmallVector ops; + for (mlir::Operation &op : tempOp.getRegion().getOps()) + ops.push_back(&op); + + // SCF Yield is converted to OMP Yield. All other operations are copied + for (mlir::Operation *op : ops) { + if (auto y = mlir::dyn_cast(op)) { + firOpBuilder.setInsertionPointToEnd(&atomicUpdateOp.getRegion().front()); + firOpBuilder.create(currentLocation, y.getResults()); + op->erase(); + } else { + op->remove(); + atomicUpdateOp.getRegion().front().push_back(op); + } + } + + // Remove the load and replace all uses of load with the block argument + for (mlir::Operation &op : atomicUpdateOp.getRegion().getOps()) { + if (auto y = mlir::dyn_cast(&op)) { + y.getRes().replaceAllUsesWith(val); + break; + } + } + + tempOp.erase(); } static void diff --git a/flang/test/Lower/OpenMP/atomic-capture.f90 b/flang/test/Lower/OpenMP/atomic-capture.f90 --- a/flang/test/Lower/OpenMP/atomic-capture.f90 +++ b/flang/test/Lower/OpenMP/atomic-capture.f90 @@ -8,11 +8,11 @@ !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} +!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: omp.atomic.capture memory_order(release) { !CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref !CHECK: omp.atomic.update %[[Y]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[ARG]] : i32 !CHECK: omp.yield(%[[result]] : i32) !CHECK: } @@ -24,10 +24,10 @@ !$omp end atomic +!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: omp.atomic.capture hint(uncontended) { !CHECK: omp.atomic.update %[[Y]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: %[[result:.*]] = arith.muli %[[temp]], %[[ARG]] : i32 !CHECK: omp.yield(%[[result]] : i32) !CHECK: } @@ -94,12 +94,12 @@ !CHECK: %[[loaded_A_addr:.*]] = fir.box_addr %[[loaded_A]] : (!fir.box>) -> !fir.ptr !CHECK: %[[loaded_B:.*]] = fir.load %[[B]] : !fir.ref>> !CHECK: %[[loaded_B_addr:.*]] = fir.box_addr %[[loaded_B]] : (!fir.box>) -> !fir.ptr -!CHECK: omp.atomic.capture { -!CHECK: omp.atomic.update %[[loaded_A_addr]] : !fir.ptr { -!CHECK: ^bb0(%[[ARG:.*]]: i32): !CHECK: %[[PRIVATE_LOADED_B:.*]] = fir.load %[[B]] : !fir.ref>> !CHECK: %[[PRIVATE_LOADED_B_addr:.*]] = fir.box_addr %[[PRIVATE_LOADED_B]] : (!fir.box>) -> !fir.ptr !CHECK: %[[loaded_value:.*]] = fir.load %[[PRIVATE_LOADED_B_addr]] : !fir.ptr +!CHECK: omp.atomic.capture { +!CHECK: omp.atomic.update %[[loaded_A_addr]] : !fir.ptr { +!CHECK: ^bb0(%[[ARG:.*]]: i32): !CHECK: %[[result:.*]] = arith.addi %[[ARG]], %[[loaded_value]] : i32 !CHECK: omp.yield(%[[result]] : i32) !CHECK: } diff --git a/flang/test/Lower/OpenMP/atomic-update.f90 b/flang/test/Lower/OpenMP/atomic-update.f90 --- a/flang/test/Lower/OpenMP/atomic-update.f90 +++ b/flang/test/Lower/OpenMP/atomic-update.f90 @@ -32,101 +32,106 @@ !CHECK: %{{.*}} = fir.convert %[[D_ADDR]] : (!fir.ref) -> !fir.ptr !CHECK: fir.store {{.*}} to %[[B_ADDR]] : !fir.ref> !CHECK: %[[LOADED_A:.*]] = fir.load %[[A_ADDR]] : !fir.ref> -!CHECK: omp.atomic.update %[[LOADED_A]] : !fir.ptr { -!CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_B:.*]] = fir.load %[[B_ADDR]] : !fir.ref> -!CHECK: %{{.*}} = fir.load %[[LOADED_B]] : !fir.ptr -!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %{{.*}} : i32 -!CHECK: omp.yield(%[[RESULT]] : i32) +!CHECK: %[[LOADED_B:.*]] = fir.load %[[B_ADDR]] : !fir.ref> +!CHECK: %[[BVAL:.*]] = fir.load %[[LOADED_B]] : !fir.ptr +!CHECK: omp.atomic.update %[[LOADED_A]] : !fir.ptr { +!CHECK: ^bb0(%[[ARG:.*]]: i32): +!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %[[BVAL]] : i32 +!CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } !$omp atomic update a = a + b +!CHECK: %[[CST_1:.*]] = arith.constant 1 : i32 !CHECK: omp.atomic.update %[[Y]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: {{.*}} = arith.constant 1 : i32 -!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], {{.*}} : i32 +!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %[[CST_1]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } + !$omp atomic + y = y + 1 + +!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref !CHECK: omp.atomic.update %[[Z]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref !CHECK: %[[RESULT:.*]] = arith.muli %[[LOADED_X]], %[[ARG]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } - !$omp atomic - y = y + 1 !$omp atomic update z = x * z +!CHECK: %[[CST_1:.*]] = arith.constant 1 : i32 !CHECK: omp.atomic.update memory_order(relaxed) hint(uncontended) %[[X]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %{{.*}} = arith.constant 1 : i32 -!CHECK: %[[RESULT:.*]] = arith.subi %[[ARG]], {{.*}} : i32 +!CHECK: %[[RESULT:.*]] = arith.subi %[[ARG]], %[[CST_1]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } + !$omp atomic relaxed update hint(omp_sync_hint_uncontended) + x = x - 1 + +!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref +!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref !CHECK: omp.atomic.update memory_order(relaxed) %[[Y]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref -!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref !CHECK: %{{.*}} = arith.cmpi sgt, %[[LOADED_X]], %[[ARG]] : i32 !CHECK: %{{.*}} = arith.select %{{.*}}, %[[LOADED_X]], %[[ARG]] : i32 !CHECK: %{{.*}} = arith.cmpi sgt, %{{.*}}, %[[LOADED_Z]] : i32 !CHECK: %[[RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %[[LOADED_Z]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } + !$omp atomic update relaxed + y = max(x, y, z) + +!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref !CHECK: omp.atomic.update memory_order(relaxed) hint(contended) %[[Z]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref !CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %[[LOADED_X]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } - !$omp atomic relaxed update hint(omp_sync_hint_uncontended) - x = x - 1 - !$omp atomic update relaxed - y = max(x, y, z) !$omp atomic relaxed hint(omp_sync_hint_contended) z = z + x +!CHECK: %[[CST_10:.*]] = arith.constant 10 : i32 !CHECK: omp.atomic.update memory_order(release) hint(contended) %[[Z]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %{{.*}} = arith.constant 10 : i32 -!CHECK: %[[RESULT:.*]] = arith.muli {{.*}}, %[[ARG]] : i32 +!CHECK: %[[RESULT:.*]] = arith.muli %[[CST_10]], %[[ARG]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } + !$omp atomic release update hint(omp_lock_hint_contended) + z = z * 10 + + +!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref !CHECK: omp.atomic.update memory_order(release) hint(speculative) %[[X]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref !CHECK: %[[RESULT:.*]] = arith.divsi %[[ARG]], %[[LOADED_Z]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } - - !$omp atomic release update hint(omp_lock_hint_contended) - z = z * 10 !$omp atomic hint(omp_lock_hint_speculative) update release x = x / z +!CHECK: %[[CST_10:.*]] = arith.constant 10 : i32 !CHECK: omp.atomic.update memory_order(seq_cst) hint(nonspeculative) %[[Y]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %{{.*}} = arith.constant 10 : i32 -!CHECK: %[[RESULT:.*]] = arith.addi %{{.*}}, %[[ARG]] : i32 +!CHECK: %[[RESULT:.*]] = arith.addi %[[CST_10]], %[[ARG]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } + !$omp atomic hint(omp_sync_hint_nonspeculative) seq_cst + y = 10 + y + +!CHECK: %[[LOADED_Y:.*]] = fir.load %[[Y]] : !fir.ref !CHECK: omp.atomic.update memory_order(seq_cst) %[[Z]] : !fir.ref { !CHECK: ^bb0(%[[ARG:.*]]: i32): -!CHECK: %[[LOADED_Y:.*]] = fir.load %[[Y]] : !fir.ref !CHECK: %[[RESULT:.*]] = arith.addi %[[LOADED_Y]], %[[ARG]] : i32 !CHECK: omp.yield(%[[RESULT]] : i32) !CHECK: } - !$omp atomic hint(omp_sync_hint_nonspeculative) seq_cst - y = 10 + y !$omp atomic seq_cst update z = y + z +!CHECK: %[[C1_VAL:.*]] = arith.constant 1 : i32 !CHECK: omp.atomic.update %[[I1]] : !fir.ref { !CHECK: ^bb0(%[[VAL:.*]]: i8): !CHECK: %[[CVT_VAL:.*]] = fir.convert %[[VAL]] : (i8) -> i32 -!CHECK: %[[C1_VAL:.*]] = arith.constant 1 : i32 !CHECK: %[[ADD_VAL:.*]] = arith.addi %[[CVT_VAL]], %[[C1_VAL]] : i32 !CHECK: %[[UPDATED_VAL:.*]] = fir.convert %[[ADD_VAL]] : (i32) -> i8 !CHECK: omp.yield(%[[UPDATED_VAL]] : i8)