diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -18,9 +18,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "ScheduleOrderedAssignments.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/IR/IRMapping.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 
 namespace hlfir {
@@ -38,12 +42,292 @@
     llvm::cl::desc("Only run ordered assignment scheduling with no codegen"),
     llvm::cl::init(false));
 
+namespace {
+/// Structure that visits an ordered assignment tree and generates code for
+/// it according to a schedule.
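+///
+/// For instance (schematic sketch; see the forall-codegen tests for the exact
+/// output), a run covering a single hlfir.forall that contains one
+/// hlfir.region_assign and needs no temporary storage is lowered to:
+///   fir.do_loop %i = %lb to %ub step %c1 {
+///     // rhs and lhs region bodies cloned here (minus their hlfir.yield)
+///     hlfir.assign %rhs to %lhs
+///   }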
+class OrderedAssignmentRewriter {
+public:
+  OrderedAssignmentRewriter(fir::FirOpBuilder &builder,
+                            hlfir::OrderedAssignmentTreeOpInterface root)
+      : builder{builder}, root{root} {}
+
+  /// Generate code for the current run of the schedule.
+  void lowerRun(hlfir::Run &run) {
+    currentRun = &run;
+    walk(root);
+    currentRun = nullptr;
+    assert(constructStack.empty() && "must exit constructs after a run");
+    mapper.clear();
+  }
+
+private:
+  /// Walk the part of an ordered assignment tree node that needs
+  /// to be evaluated in the current run.
+  void walk(hlfir::OrderedAssignmentTreeOpInterface node);
+
+  /// Generate code when entering a given ordered assignment node.
+  void pre(hlfir::ForallOp forallOp);
+  void pre(hlfir::ForallIndexOp);
+  void pre(hlfir::ForallMaskOp);
+  void pre(hlfir::WhereOp whereOp);
+  void pre(hlfir::ElseWhereOp elseWhereOp);
+  void pre(hlfir::RegionAssignOp);
+
+  /// Generate code when leaving a given ordered assignment node.
+  void post(hlfir::ForallOp);
+  void post(hlfir::ForallMaskOp);
+
+  /// Is this an assignment to a vector subscripted entity?
+  static bool hasVectorSubscriptedLhs(hlfir::RegionAssignOp regionAssignOp);
+  /// Are there any leaf regions in the node that must be saved in the current
+  /// run?
+  bool mustSaveRegionIn(hlfir::OrderedAssignmentTreeOpInterface node) const;
+  /// Should this node be evaluated in the current run? Saving a region in a
+  /// node does not imply the node needs to be evaluated.
+  bool
+  isRequiredInCurrentRun(hlfir::OrderedAssignmentTreeOpInterface node) const;
+
+  /// Generate a scalar value yielded by an ordered assignment tree region.
+  /// If the value was not saved in a previous run, this clones the region
+  /// code, except the final yield, at the current execution point.
+  /// If the value was saved in a previous run, this fetches the saved value
+  /// from the temporary storage and returns the value.
+  mlir::Value generateYieldedScalarValue(mlir::Region &region);
+
+  /// Generate an entity yielded by an ordered assignment tree region, and
+  /// optionally return the (uncloned) yield if there is any clean-up that
+  /// should be done after using the entity. Like generateYieldedScalarValue,
+  /// this will return the saved value if the region was saved in a previous
+  /// run.
+  std::pair<mlir::Value, std::optional<hlfir::YieldOp>>
+  generateYieldedEntity(mlir::Region &region);
+
+  /// If \p maybeYield is present and has a clean-up, generate the clean-up
+  /// at the current insertion point (by cloning).
+  void generateCleanupIfAny(std::optional<hlfir::YieldOp> maybeYield);
+
+  fir::FirOpBuilder &builder;
+
+  /// Mapping between the original ordered assignment tree operations and the
+  /// operations that have been cloned in the current run. It is reset between
+  /// runs.
+  mlir::IRMapping mapper;
+  /// Stack of the construct operations created in the current run. It allows
+  /// restoring the insertion point correctly when leaving a node that
+  /// required creating a fir.do_loop or fir.if operation.
+  llvm::SmallVector<mlir::Operation *> constructStack;
+  /// Root of the ordered assignment tree being lowered.
+  hlfir::OrderedAssignmentTreeOpInterface root;
+  /// Pointer to the current run of the schedule being lowered.
+  hlfir::Run *currentRun = nullptr;
+};
+} // namespace
+
+void OrderedAssignmentRewriter::walk(
+    hlfir::OrderedAssignmentTreeOpInterface node) {
+  if (mustSaveRegionIn(node))
+    TODO(node.getLoc(),
+         "creating temporary storage in FORALL or WHERE constructs");
+  if (isRequiredInCurrentRun(node) || mlir::isa<hlfir::ForallIndexOp>(node)) {
+    llvm::TypeSwitch<mlir::Operation *, void>(node.getOperation())
+        .Case<hlfir::ForallOp, hlfir::ForallIndexOp, hlfir::ForallMaskOp,
+              hlfir::RegionAssignOp, hlfir::WhereOp, hlfir::ElseWhereOp>(
+            [&](auto concreteOp) { pre(concreteOp); })
+        .Default([](auto) {});
+    if (auto *body = node.getSubTreeRegion()) {
+      for (mlir::Operation &op : body->getOps())
+        if (auto subNode =
+                mlir::dyn_cast<hlfir::OrderedAssignmentTreeOpInterface>(op))
+          walk(subNode);
+      llvm::TypeSwitch<mlir::Operation *, void>(node.getOperation())
+          .Case<hlfir::ForallOp, hlfir::ForallMaskOp>(
+              [&](auto concreteOp) { post(concreteOp); })
+          .Default([](auto) {});
+    }
+  }
+}
+
+void OrderedAssignmentRewriter::pre(hlfir::ForallOp forallOp) {
+  // Create a fir.do_loop given the hlfir.forall control values.
+  mlir::Value rawLowerBound =
+      generateYieldedScalarValue(forallOp.getLbRegion());
+  mlir::Location loc = forallOp.getLoc();
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Value lb = builder.createConvert(loc, idxTy, rawLowerBound);
+  mlir::Value rawUpperBound =
+      generateYieldedScalarValue(forallOp.getUbRegion());
+  mlir::Value ub = builder.createConvert(loc, idxTy, rawUpperBound);
+  mlir::Value step;
+  if (forallOp.getStepRegion().empty()) {
+    step = builder.createIntegerConstant(loc, idxTy, 1);
+  } else {
+    step = generateYieldedScalarValue(forallOp.getStepRegion());
+    step = builder.createConvert(loc, idxTy, step);
+  }
+  auto doLoop = builder.create<fir::DoLoopOp>(loc, lb, ub, step);
+  builder.setInsertionPointToStart(doLoop.getBody());
+  mlir::Value oldIndex = forallOp.getForallIndexValue();
+  mlir::Value newIndex =
+      builder.createConvert(loc, oldIndex.getType(), doLoop.getInductionVar());
+  mapper.map(oldIndex, newIndex);
+  constructStack.push_back(doLoop);
+}
+
+void OrderedAssignmentRewriter::post(hlfir::ForallOp) {
+  assert(!constructStack.empty() && "must contain a loop");
+  builder.setInsertionPointAfter(constructStack.pop_back_val());
+}
+
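+// An hlfir.forall_index is lowered to a scalar temporary: the mapped loop
+// index value is converted and stored into it, so that loads of the index
+// variable inside the construct read the value of the current iteration.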
+void OrderedAssignmentRewriter::pre(hlfir::ForallIndexOp forallIndexOp) {
+  mlir::Location loc = forallIndexOp.getLoc();
+  mlir::Type intTy = fir::unwrapRefType(forallIndexOp.getType());
+  mlir::Value indexVar =
+      builder.createTemporary(loc, intTy, forallIndexOp.getName());
+  mlir::Value newVal = mapper.lookupOrDefault(forallIndexOp.getIndex());
+  builder.createStoreWithConvert(loc, newVal, indexVar);
+  mapper.map(forallIndexOp, indexVar);
+}
+
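+// An hlfir.forall_mask is lowered to a fir.if guarding its body: the mask
+// value is evaluated at the current insertion point and converted to i1.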
+void OrderedAssignmentRewriter::pre(hlfir::ForallMaskOp forallMaskOp) {
+  mlir::Location loc = forallMaskOp.getLoc();
+  mlir::Value mask = generateYieldedScalarValue(forallMaskOp.getMaskRegion());
+  mask = builder.createConvert(loc, builder.getI1Type(), mask);
+  auto ifOp = builder.create<fir::IfOp>(loc, std::nullopt, mask, false);
+  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+  constructStack.push_back(ifOp);
+}
+
+void OrderedAssignmentRewriter::post(hlfir::ForallMaskOp forallMaskOp) {
+  assert(!constructStack.empty() && "must contain an ifop");
+  builder.setInsertionPointAfter(constructStack.pop_back_val());
+}
+
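+// An hlfir.region_assign is lowered by materializing its RHS and LHS at the
+// current insertion point and creating an hlfir.assign between them, followed
+// by any clean-ups attached to the RHS and LHS yields.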
+void OrderedAssignmentRewriter::pre(hlfir::RegionAssignOp regionAssignOp) {
+  mlir::Location loc = regionAssignOp.getLoc();
+  auto [rhs, oldRhsYield] =
+      generateYieldedEntity(regionAssignOp.getRhsRegion());
+  if (hasVectorSubscriptedLhs(regionAssignOp))
+    TODO(loc, "assignment to vector subscripted entity");
+  auto [lhs, oldLhsYield] =
+      generateYieldedEntity(regionAssignOp.getLhsRegion());
+  if (!regionAssignOp.getUserDefinedAssignment().empty())
+    TODO(loc, "user defined assignment inside FORALL or WHERE");
+  // TODO: preserve allocatable assignment aspects for forall once
+  // they are conveyed in hlfir.region_assign.
+  builder.create<hlfir::AssignOp>(loc, rhs, lhs);
+  generateCleanupIfAny(oldRhsYield);
+  generateCleanupIfAny(oldLhsYield);
+}
+
+void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) {
+  mlir::Location loc = whereOp.getLoc();
+  TODO(loc, "WHERE in HLFIR");
+}
+
+void OrderedAssignmentRewriter::pre(hlfir::ElseWhereOp elseWhereOp) {
+  mlir::Location loc = elseWhereOp.getLoc();
+  TODO(loc, "ELSEWHERE in HLFIR");
+}
+
+std::pair<mlir::Value, std::optional<hlfir::YieldOp>>
+OrderedAssignmentRewriter::generateYieldedEntity(mlir::Region &region) {
+  // TODO: if the region was saved, use that instead of generating code again.
+  assert(region.hasOneBlock() && "region must contain one block");
+  // Clone all operations except the final hlfir.yield.
+  mlir::Block::OpListType &ops = region.back().getOperations();
+  assert(!ops.empty() && "yield block cannot be empty");
+  auto end = ops.end();
+  for (auto opIt = ops.begin(); std::next(opIt) != end; ++opIt)
+    (void)builder.clone(*opIt, mapper);
+  auto oldYield = mlir::dyn_cast_or_null<hlfir::YieldOp>(
+      region.back().getOperations().back());
+  assert(oldYield && "region computing scalar must end with a YieldOp");
+  // Get the value for the yielded entity: it may be the result of an operation
+  // that was cloned, or the original value if the yield operand was defined
+  // before the ordered assignment tree.
+  mlir::Value newEntity = mapper.lookupOrDefault(oldYield.getEntity());
+  if (oldYield.getCleanup().empty())
+    return {newEntity, std::nullopt};
+  return {newEntity, oldYield};
+}
+
+mlir::Value
+OrderedAssignmentRewriter::generateYieldedScalarValue(mlir::Region &region) {
+  auto [value, maybeYield] = generateYieldedEntity(region);
+  assert(fir::isa_trivial(value.getType()) && "not a trivial scalar value");
+  generateCleanupIfAny(maybeYield);
+  return value;
+}
+
+void OrderedAssignmentRewriter::generateCleanupIfAny(
+    std::optional<hlfir::YieldOp> maybeYield) {
+  if (maybeYield.has_value())
+    if (!maybeYield->getCleanup().empty()) {
+      assert(maybeYield->getCleanup().hasOneBlock() &&
+             "region must contain one block");
+      for (auto &op : maybeYield->getCleanup().back().getOperations())
+        builder.clone(op, mapper);
+    }
+}
+
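+// A vector subscripted LHS is detected by the presence of an
+// hlfir.elemental_addr as the last operation of the LHS region.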
+bool OrderedAssignmentRewriter::hasVectorSubscriptedLhs(
+    hlfir::RegionAssignOp regionAssignOp) {
+  return mlir::isa<hlfir::ElementalAddrOp>(
+      regionAssignOp.getLhsRegion().back().back());
+}
+
+bool OrderedAssignmentRewriter::mustSaveRegionIn(
+    hlfir::OrderedAssignmentTreeOpInterface node) const {
+  for (auto &action : currentRun->actions)
+    if (hlfir::SaveEntity *savedEntity =
+            std::get_if<hlfir::SaveEntity>(&action))
+      if (node.getOperation() == savedEntity->yieldRegion->getParentOp())
+        return true;
+  return false;
+}
+
+bool OrderedAssignmentRewriter::isRequiredInCurrentRun(
+    hlfir::OrderedAssignmentTreeOpInterface node) const {
+  // hlfir.forall_index operations do not contain saved regions/assignments,
+  // but if their hlfir.forall parent is required, they are required too
+  // (the forall indices need to be mapped).
+  if (mlir::isa<hlfir::ForallIndexOp>(node))
+    return true;
+  for (auto &action : currentRun->actions)
+    if (hlfir::SaveEntity *savedEntity =
+            std::get_if<hlfir::SaveEntity>(&action)) {
+      // A SaveEntity action does not require evaluating the node that contains
+      // it, but it does require evaluating all the parents of that node. For
+      // instance, saving a bound of hlfir.forall B does not require creating
+      // the loop for B, but it does require creating the loops of any forall
+      // parent A of forall B.
+      if (node->isProperAncestor(savedEntity->yieldRegion->getParentOp()))
+        return true;
+    } else {
+      auto assign = std::get<hlfir::RegionAssignOp>(action);
+      if (node->isAncestor(assign.getOperation()))
+        return true;
+    }
+  return false;
+}
+
+/// Lower an ordered assignment tree to fir.do_loop and hlfir.assign given
+/// a schedule.
+static void lower(hlfir::OrderedAssignmentTreeOpInterface root,
+                  mlir::PatternRewriter &rewriter, hlfir::Schedule &schedule) {
+  auto module = root->getParentOfType<mlir::ModuleOp>();
+  fir::FirOpBuilder builder(rewriter, fir::getKindMapping(module));
+  OrderedAssignmentRewriter assignmentRewriter(builder, root);
+  for (auto &run : schedule)
+    assignmentRewriter.lowerRun(run);
+}
+
 /// Shared rewrite entry point for all the ordered assignment tree root
 /// operations. It calls the scheduler and then apply the schedule.
-static mlir::LogicalResult
-rewrite(hlfir::OrderedAssignmentTreeOpInterface &root,
-        bool tryFusingAssignments, mlir::PatternRewriter &rewriter) {
-  (void)hlfir::buildEvaluationSchedule(root, tryFusingAssignments);
+static mlir::LogicalResult rewrite(hlfir::OrderedAssignmentTreeOpInterface root,
+                                   bool tryFusingAssignments,
+                                   mlir::PatternRewriter &rewriter) {
+  hlfir::Schedule schedule =
+      hlfir::buildEvaluationSchedule(root, tryFusingAssignments);
 
   LLVM_DEBUG(
       /// Debug option to print the scheduling debug info without doing
@@ -55,8 +339,9 @@
         rewriter.eraseOp(root);
         return mlir::success();
       });
-  // TODO: lower to loops according to schedule.
-  return mlir::failure();
+  lower(root, rewriter, schedule);
+  rewriter.eraseOp(root);
+  return mlir::success();
 }
 
 namespace {
diff --git a/flang/test/HLFIR/order_assignments/forall-codegen-fuse-assignments.fir b/flang/test/HLFIR/order_assignments/forall-codegen-fuse-assignments.fir
new file mode 100644
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/forall-codegen-fuse-assignments.fir
@@ -0,0 +1,51 @@
+// Test code generation of hlfir.forall when assignment fusing is enabled
+// and possible.
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments=fuse-assignments=true | FileCheck %s --check-prefix=FUSE
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments=fuse-assignments=false | FileCheck %s --check-prefix=NOFUSE
+
+func.func @test_assignment_fusing(%x: !fir.ref<!fir.array<10xi32>>, %y : !fir.box<!fir.array<?xi32>>) {
+  %c42 = arith.constant 42 : i32
+  hlfir.forall lb {
+    %c1 = arith.constant 1 : index
+    hlfir.yield %c1 : index
+  } ub {
+    %c10 = arith.constant 10 : index
+    hlfir.yield %c10 : index
+  }  (%i: index) {
+    hlfir.region_assign {
+      hlfir.yield %c42 : i32
+    } to {
+      %2 = hlfir.designate %x (%i)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      hlfir.yield %2 : !fir.ref<i32>
+    }
+    hlfir.region_assign {
+      hlfir.yield %c42 : i32
+    } to {
+      %2 = hlfir.designate %y (%i)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+      hlfir.yield %2 : !fir.ref<i32>
+    }
+  }
+  return
+}
+// FUSE-LABEL:   func.func @test_assignment_fusing(
+// FUSE-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>,
+// FUSE-SAME:                                      %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>) {
+// FUSE:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// FUSE:           %[[VAL_3:.*]] = arith.constant 1 : index
+// FUSE:           %[[VAL_4:.*]] = arith.constant 10 : index
+// FUSE:           %[[VAL_5:.*]] = arith.constant 1 : index
+// FUSE:           fir.do_loop %[[VAL_6:.*]] = %[[VAL_3]] to %[[VAL_4]] step %[[VAL_5]] {
+// FUSE-NEXT:         %[[VAL_7:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_6]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// FUSE-NEXT:         hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.ref<i32>
+// FUSE-NEXT:         %[[VAL_8:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_6]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// FUSE-NEXT:         hlfir.assign %[[VAL_2]] to %[[VAL_8]] : i32, !fir.ref<i32>
+// FUSE-NEXT:      }
+// FUSE-NEXT:      return
+
+// NOFUSE-LABEL: func.func @test_assignment_fusing(
+// NOFUSE:         fir.do_loop
+// NOFUSE:           hlfir.assign
+// NOFUSE:         }
+// NOFUSE:         fir.do_loop
+// NOFUSE:           hlfir.assign
+// NOFUSE:         }
diff --git a/flang/test/HLFIR/order_assignments/forall-codegen-no-conflict.fir b/flang/test/HLFIR/order_assignments/forall-codegen-no-conflict.fir
new file mode 100644
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/forall-codegen-no-conflict.fir
@@ -0,0 +1,201 @@
+// Test code generation of hlfir.forall, hlfir.forall_index, and hlfir.forall_mask.
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments | FileCheck %s
+
+func.func @test_simple(%x: !fir.ref<!fir.array<10xi32>>) {
+  hlfir.forall lb {
+    %c1 = arith.constant 1 : index
+    hlfir.yield %c1 : index
+  } ub {
+    %c10 = arith.constant 10 : index
+    hlfir.yield %c10 : index
+  }  (%i: index) {
+    hlfir.region_assign {
+      %c42 = arith.constant 42 : i32
+      hlfir.yield %c42 : i32
+    } to {
+      %2 = hlfir.designate %x (%i)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      hlfir.yield %2 : !fir.ref<i32>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_simple(
+// CHECK-SAME:                           %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_4:.*]] = %[[VAL_1]] to %[[VAL_2]] step %[[VAL_3]] {
+// CHECK:             %[[VAL_5:.*]] = arith.constant 42 : i32
+// CHECK:             %[[VAL_6:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_5]] to %[[VAL_6]] : i32, !fir.ref<i32>
+// CHECK:           }
+
+func.func @test_index(%x: !fir.ref<!fir.array<10xi32>>) {
+  hlfir.forall lb {
+    %c1 = arith.constant 1 : index
+    hlfir.yield %c1 : index
+  } ub {
+    %c10 = arith.constant 10 : index
+    hlfir.yield %c10 : index
+  }  (%arg1: i32) {
+    %i = hlfir.forall_index "i" %arg1 : (i32) -> !fir.ref<i32>
+    hlfir.region_assign {
+      %i_load = fir.load %i : !fir.ref<i32>
+      hlfir.yield %i_load : i32
+    } to {
+      %2 = hlfir.designate %x (%arg1)  : (!fir.ref<!fir.array<10xi32>>, i32) -> !fir.ref<i32>
+      hlfir.yield %2 : !fir.ref<i32>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_index(
+// CHECK-SAME:                          %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i"}
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_4]] {
+// CHECK:             %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (index) -> i32
+// CHECK:             fir.store %[[VAL_6]] to %[[VAL_1]] : !fir.ref<i32>
+// CHECK:             %[[VAL_7:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_6]])  : (!fir.ref<!fir.array<10xi32>>, i32) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_7]] to %[[VAL_8]] : i32, !fir.ref<i32>
+// CHECK:           }
+
+
+func.func @split_schedule(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: !fir.box<!fir.array<?xf32>>, %arg2: !fir.box<!fir.array<?x?xf32>>) {
+  %c11 = arith.constant 11 : i64
+  %c10 = arith.constant 10 : i64
+  %c1 = arith.constant 1 : i64
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %1:2 = hlfir.declare %arg1 {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %2:2 = hlfir.declare %arg2 {uniq_name = "z"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+  hlfir.forall lb {
+    hlfir.yield %c1 : i64
+  } ub {
+    hlfir.yield %c10 : i64
+  }  (%arg3: i64) {
+    hlfir.region_assign {
+      %3 = hlfir.designate %1#0 (%arg3)  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      %4 = fir.load %3 : !fir.ref<f32>
+      hlfir.yield %4 : f32
+    } to {
+      %3 = hlfir.designate %0#0 (%arg3)  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      hlfir.yield %3 : !fir.ref<f32>
+    }
+    hlfir.forall lb {
+      hlfir.yield %c1 : i64
+    } ub {
+      hlfir.yield %c10 : i64
+    }  (%arg4: i64) {
+      hlfir.region_assign {
+        %3 = arith.subi %c11, %arg3 : i64
+        %4 = hlfir.designate %0#0 (%3)  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+        %5 = fir.load %4 : !fir.ref<f32>
+        hlfir.yield %5 : f32
+      } to {
+        %3 = hlfir.designate %2#0 (%arg3, %arg4)  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+        hlfir.yield %3 : !fir.ref<f32>
+      }
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @split_schedule(
+// CHECK:           %[[VAL_3:.*]] = arith.constant 11 : i64
+// CHECK:           %[[VAL_4:.*]] = arith.constant 10 : i64
+// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "z"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:           %[[VAL_11:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_12:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_11]] {
+// CHECK:             %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (index) -> i64
+// CHECK:             %[[VAL_14:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_13]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:             %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref<f32>
+// CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_13]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:             hlfir.assign %[[VAL_15]] to %[[VAL_16]] : f32, !fir.ref<f32>
+// CHECK:           }
+// CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
+// CHECK:           %[[VAL_18:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:           %[[VAL_19:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_20:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_19]] {
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (index) -> i64
+// CHECK:             %[[VAL_22:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
+// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:             %[[VAL_24:.*]] = arith.constant 1 : index
+// CHECK:             fir.do_loop %[[VAL_25:.*]] = %[[VAL_22]] to %[[VAL_23]] step %[[VAL_24]] {
+// CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (index) -> i64
+// CHECK:               %[[VAL_27:.*]] = arith.subi %[[VAL_3]], %[[VAL_21]] : i64
+// CHECK:               %[[VAL_28:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_27]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<f32>
+// CHECK:               %[[VAL_30:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_21]], %[[VAL_26]])  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+// CHECK:               hlfir.assign %[[VAL_29]] to %[[VAL_30]] : f32, !fir.ref<f32>
+// CHECK:             }
+// CHECK:           }
+
+func.func @test_mask(%arg0: !fir.box<!fir.array<?x?xf32>>, %arg1: !fir.box<!fir.array<?x?xf32>>, %arg2: !fir.box<!fir.array<?x!fir.logical<4>>>) {
+  %c10 = arith.constant 10 : i64
+  %c1 = arith.constant 1 : i64
+  %0:2 = hlfir.declare %arg2 {uniq_name = "mask"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+  %1:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+  %2:2 = hlfir.declare %arg1 {uniq_name = "y"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+  hlfir.forall lb {
+    hlfir.yield %c1 : i64
+  } ub {
+    hlfir.yield %c10 : i64
+  }  (%arg3: i64) {
+    hlfir.forall_mask {
+      %3 = hlfir.designate %0#0 (%arg3)  : (!fir.box<!fir.array<?x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+      %4 = fir.load %3 : !fir.ref<!fir.logical<4>>
+      %5 = fir.convert %4 : (!fir.logical<4>) -> i1
+      hlfir.yield %5 : i1
+    } do {
+      hlfir.forall lb {
+        hlfir.yield %c1 : i64
+      } ub {
+        hlfir.yield %arg3 : i64
+      }  (%arg4: i64) {
+        hlfir.region_assign {
+          %3 = hlfir.designate %2#0 (%arg3, %arg4)  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+          %4 = fir.load %3 : !fir.ref<f32>
+          hlfir.yield %4 : f32
+        } to {
+          %3 = hlfir.designate %1#0 (%arg3, %arg4)  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+          hlfir.yield %3 : !fir.ref<f32>
+        }
+      }
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_mask(
+// CHECK:           %[[VAL_3:.*]] = arith.constant 10 : i64
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "mask"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "x"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "y"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+// CHECK:           %[[VAL_10:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_10]] {
+// CHECK:             %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (index) -> i64
+// CHECK:             %[[VAL_13:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_12]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_15]] {
+// CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:               %[[VAL_17:.*]] = fir.convert %[[VAL_12]] : (i64) -> index
+// CHECK:               %[[VAL_18:.*]] = arith.constant 1 : index
+// CHECK:               fir.do_loop %[[VAL_19:.*]] = %[[VAL_16]] to %[[VAL_17]] step %[[VAL_18]] {
+// CHECK:                 %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (index) -> i64
+// CHECK:                 %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_12]], %[[VAL_20]])  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<f32>
+// CHECK:                 %[[VAL_23:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_12]], %[[VAL_20]])  : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+// CHECK:                 hlfir.assign %[[VAL_22]] to %[[VAL_23]] : f32, !fir.ref<f32>
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
diff --git a/flang/test/HLFIR/ordered-assignments-codegen-todo.fir b/flang/test/HLFIR/ordered-assignments-codegen-todo.fir
--- a/flang/test/HLFIR/ordered-assignments-codegen-todo.fir
+++ b/flang/test/HLFIR/ordered-assignments-codegen-todo.fir
@@ -2,9 +2,9 @@
 // RUN: %not_todo_cmd fir-opt --lower-hlfir-ordered-assignments %s 2>&1 | FileCheck %s
 
 
-// CHECK: not yet implemented: FORALL construct or statement in HLFIR
+// CHECK: not yet implemented: creating temporary storage in FORALL or WHERE constructs
 
-func.func @forall_todo(%arg0: !fir.ref<!fir.array<10xf32>>, %arg1: !fir.ref<!fir.array<10xf32>>) {
+func.func @forall_todo(%arg0: !fir.ref<!fir.array<10xf32>>) {
   %c1 = arith.constant 1 : index
   %c10 = arith.constant 10 : index
   hlfir.forall lb {
@@ -13,7 +13,7 @@
     hlfir.yield %c10 : index
   }  (%arg2: i64) {
     hlfir.region_assign {
-      %1 = hlfir.designate %arg1 (%arg2)  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
+      %1 = hlfir.designate %arg0 (%arg2)  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
       hlfir.yield %1 : !fir.ref<f32>
     } to {
       %1 = hlfir.designate %arg0 (%arg2)  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>