diff --git a/flang/include/flang/Optimizer/Builder/Array.h b/flang/include/flang/Optimizer/Builder/Array.h
new file mode 100644
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Array.h
@@ -0,0 +1,27 @@
+//===-- Array.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
+#define FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+
+namespace fir::factory {
+
+/// Return true if and only if the extents are those of an assumed-size array.
+/// An assumed-size array in Fortran is declared with `*` as the upper bound of
+/// the last dimension of the array. Lowering converts the asterisk to an
+/// undefined value.
+inline bool isAssumedSize(const llvm::SmallVectorImpl<mlir::Value> &extents) {
+  return !extents.empty() &&
+         mlir::isa_and_nonnull<fir::UndefOp>(extents.back().getDefiningOp());
+}
+
+} // namespace fir::factory
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -371,6 +371,12 @@
   /// Generate code testing \p addr is a null address.
   mlir::Value genIsNull(mlir::Location loc, mlir::Value addr);
 
+  /// Compute the extent of (lb:ub:step) as max((ub-lb+step)/step, 0). See
+  /// Fortran 2018 section 9.5.3.3.2 for more details.
+  mlir::Value genExtentFromTriplet(mlir::Location loc, mlir::Value lb,
+                                   mlir::Value ub, mlir::Value step,
+                                   mlir::Type type);
+
 private:
   const KindMapping &kindMap;
 };
diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
--- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
@@ -82,6 +82,10 @@
   return "fir.host_assoc";
 }
 
+/// Does the function, \p func, have a host-associations tuple argument?
+/// Some internal procedures may have access to host procedure variables.
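+/// The tuple argument is identified by the `fir.host_assoc` attribute
+/// returned by getHostAssocAttrName() above.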
+bool hasHostAssociationArgument(mlir::FuncOp func);
+
 /// Tell if \p value is:
 ///   - a function argument that has attribute \p attributeName
 ///   - or, the result of fir.alloca/fir.allocamem op that has attribute \p
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -507,6 +507,23 @@
                       mlir::arith::CmpIPredicate::eq);
 }
 
+mlir::Value fir::FirOpBuilder::genExtentFromTriplet(mlir::Location loc,
+                                                    mlir::Value lb,
+                                                    mlir::Value ub,
+                                                    mlir::Value step,
+                                                    mlir::Type type) {
+  auto zero = createIntegerConstant(loc, type, 0);
+  lb = createConvert(loc, type, lb);
+  ub = createConvert(loc, type, ub);
+  step = createConvert(loc, type, step);
+  auto diff = create<mlir::arith::SubIOp>(loc, ub, lb);
+  auto add = create<mlir::arith::AddIOp>(loc, diff, step);
+  auto div = create<mlir::arith::DivSIOp>(loc, add, step);
+  auto cmp = create<mlir::arith::CmpIOp>(loc, mlir::arith::CmpIPredicate::sgt,
+                                         div, zero);
+  return create<mlir::arith::SelectOp>(loc, cmp, div, zero);
+}
+
 //===--------------------------------------------------------------------===//
 // ExtendedValue inquiry helper implementation
 //===--------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -3257,6 +3257,15 @@
   return result;
 }
 
+bool fir::hasHostAssociationArgument(mlir::FuncOp func) {
+  if (auto allArgAttrs = func.getAllArgAttrs())
+    for (auto attr : allArgAttrs)
+      if (auto dict = attr.template dyn_cast_or_null<mlir::DictionaryAttr>())
+        if (dict.get(fir::getHostAssocAttrName()))
+          return true;
+  return false;
+}
+
 bool fir::valueHasFirAttribute(mlir::Value value,
                                llvm::StringRef attributeName) {
   // If this is a fir.box that was loaded, the fir attributes will be on the
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -7,10 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "flang/Lower/Todo.h"
+#include "flang/Optimizer/Builder/Array.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Factory.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/FIRContext.h"
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
@@ -60,8 +63,8 @@
 public:
   using ConflictSetT = llvm::SmallPtrSet<mlir::Operation *, 16>;
   using UseSetT = llvm::SmallPtrSet<mlir::OpOperand *, 8>;
-  using LoadMapSetsT =
-      llvm::DenseMap<mlir::Operation *, llvm::SmallVector<mlir::Operation *>>;
+  using LoadMapSetsT = llvm::DenseMap<mlir::Operation *, UseSetT>;
+  using AmendAccessSetT = llvm::SmallPtrSet<mlir::Operation *, 4>;
 
   ArrayCopyAnalysis(mlir::Operation *op) : operation{op} { construct(op); }
 
@@ -75,13 +78,27 @@
     return conflicts.contains(op);
   }
 
-  /// Return the use map. The use map maps array fetch and update operations
-  /// back to the array load that is the original source of the array value.
+  /// Return the use map.
+  /// The use map maps array access, amend, fetch and update operations back to
+  /// the array load that is the original source of the array value.
+  /// It maps an array_load to an array_merge_store, if and only if the loaded
+  /// array value has pending modifications to be merged.
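+  /// For example, an array_update that modifies a loaded array value is
+  /// mapped back to the array_load that produced that value.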
const OperationUseMapT &getUseMap() const { return useMap; } - /// Find all the array operations that access the array value that is loaded - /// by the array load operation, `load`. - const llvm::SmallVector &arrayAccesses(ArrayLoadOp load); + /// Return the set of array_access ops directly associated with array_amend + /// ops. + bool inAmendAccessSet(mlir::Operation *op) const { + return amendAccesses.count(op); + } + + /// For ArrayLoad `load`, return the transitive set of all OpOperands. + UseSetT getLoadUseSet(mlir::Operation *load) const { + assert(loadMapSets.count(load) && "analysis missed an array load?"); + return loadMapSets.lookup(load); + } + + void arrayMentions(llvm::SmallVectorImpl &mentions, + ArrayLoadOp load); private: void construct(mlir::Operation *topLevelOp); @@ -90,36 +107,53 @@ ConflictSetT conflicts; // set of conflicts (loads and merge stores) OperationUseMapT useMap; LoadMapSetsT loadMapSets; + // Set of array_access ops associated with array_amend ops. + AmendAccessSetT amendAccesses; }; } // namespace namespace { /// Helper class to collect all array operations that produced an array value. class ReachCollector { -private: - // If provided, the `loopRegion` is the body of a loop that produces the array - // of interest. +public: ReachCollector(llvm::SmallVectorImpl &reach, mlir::Region *loopRegion) : reach{reach}, loopRegion{loopRegion} {} - void collectArrayAccessFrom(mlir::Operation *op, mlir::ValueRange range) { - llvm::errs() << "COLLECT " << *op << "\n"; + void collectArrayMentionFrom(mlir::Operation *op, mlir::ValueRange range) { if (range.empty()) { - collectArrayAccessFrom(op, mlir::Value{}); + collectArrayMentionFrom(op, mlir::Value{}); return; } for (mlir::Value v : range) - collectArrayAccessFrom(v); + collectArrayMentionFrom(v); + } + + // Collect all the array_access ops in `block`. This recursively looks into + // blocks in ops with regions. + // FIXME: This is temporarily relying on the array_amend appearing in a + // do_loop Region. This phase ordering assumption can be eliminated by using + // dominance information to find the array_access ops or by scanning the + // transitive closure of the amending array_access's users and the defs that + // reach them. + void collectAccesses(llvm::SmallVector &result, + mlir::Block *block) { + for (auto &op : *block) { + if (auto access = mlir::dyn_cast(op)) { + LLVM_DEBUG(llvm::dbgs() << "adding access: " << access << '\n'); + result.push_back(access); + continue; + } + for (auto ®ion : op.getRegions()) + for (auto &bb : region.getBlocks()) + collectAccesses(result, &bb); + } } - // TODO: Replace recursive algorithm on def-use chain with an iterative one - // with an explicit stack. - void collectArrayAccessFrom(mlir::Operation *op, mlir::Value val) { + void collectArrayMentionFrom(mlir::Operation *op, mlir::Value val) { // `val` is defined by an Op, process the defining Op. // If `val` is defined by a region containing Op, we want to drill down // and through that Op's region(s). 
- llvm::errs() << "COLLECT " << *op << "\n"; LLVM_DEBUG(llvm::dbgs() << "popset: " << *op << '\n'); auto popFn = [&](auto rop) { assert(val && "op must have a result value"); @@ -127,9 +161,13 @@ llvm::SmallVector results; rop.resultToSourceOps(results, resNum); for (auto u : results) - collectArrayAccessFrom(u); + collectArrayMentionFrom(u); }; - if (auto rop = mlir::dyn_cast(op)) { + if (auto rop = mlir::dyn_cast(op)) { + popFn(rop); + return; + } + if (auto rop = mlir::dyn_cast(op)) { popFn(rop); return; } @@ -137,44 +175,73 @@ popFn(rop); return; } + if (auto box = mlir::dyn_cast(op)) { + for (auto *user : box.getMemref().getUsers()) + if (user != op) + collectArrayMentionFrom(user, user->getResults()); + return; + } if (auto mergeStore = mlir::dyn_cast(op)) { if (opIsInsideLoops(mergeStore)) - collectArrayAccessFrom(mergeStore.getSequence()); + collectArrayMentionFrom(mergeStore.getSequence()); return; } if (mlir::isa(op)) { // Look for any stores inside the loops, and collect an array operation // that produced the value being stored to it. - for (mlir::Operation *user : op->getUsers()) + for (auto *user : op->getUsers()) if (auto store = mlir::dyn_cast(user)) if (opIsInsideLoops(store)) - collectArrayAccessFrom(store.getValue()); + collectArrayMentionFrom(store.getValue()); return; } + // Scan the uses of amend's memref + if (auto amend = mlir::dyn_cast(op)) { + reach.push_back(op); + llvm::SmallVector accesses; + collectAccesses(accesses, op->getBlock()); + for (auto access : accesses) + collectArrayMentionFrom(access.getResult()); + } + // Otherwise, Op does not contain a region so just chase its operands. - if (mlir::isa( - op)) { + if (mlir::isa(op)) { LLVM_DEBUG(llvm::dbgs() << "add " << *op << " to reachable set\n"); - reach.emplace_back(op); + reach.push_back(op); } - // Array modify assignment is performed on the result. So the analysis - // must look at the what is done with the result. + + // Include all array_access ops using an array_load. + if (auto arrLd = mlir::dyn_cast(op)) + for (auto *user : arrLd.getResult().getUsers()) + if (mlir::isa(user)) { + LLVM_DEBUG(llvm::dbgs() << "add " << *user << " to reachable set\n"); + reach.push_back(user); + } + + // Array modify assignment is performed on the result. So the analysis must + // look at the what is done with the result. if (mlir::isa(op)) - for (mlir::Operation *user : op->getResult(0).getUsers()) + for (auto *user : op->getResult(0).getUsers()) followUsers(user); + if (mlir::isa(op)) { + LLVM_DEBUG(llvm::dbgs() << "add " << *op << " to reachable set\n"); + reach.push_back(op); + } + for (auto u : op->getOperands()) - collectArrayAccessFrom(u); + collectArrayMentionFrom(u); } - void collectArrayAccessFrom(mlir::BlockArgument ba) { + void collectArrayMentionFrom(mlir::BlockArgument ba) { auto *parent = ba.getOwner()->getParentOp(); // If inside an Op holding a region, the block argument corresponds to an // argument passed to the containing Op. auto popFn = [&](auto rop) { - collectArrayAccessFrom(rop.blockArgToSourceOp(ba.getArgNumber())); + collectArrayMentionFrom(rop.blockArgToSourceOp(ba.getArgNumber())); }; if (auto rop = mlir::dyn_cast(parent)) { popFn(rop); @@ -187,46 +254,64 @@ // Otherwise, a block argument is provided via the pred blocks. for (auto *pred : ba.getOwner()->getPredecessors()) { auto u = pred->getTerminator()->getOperand(ba.getArgNumber()); - collectArrayAccessFrom(u); + collectArrayMentionFrom(u); } } // Recursively trace operands to find all array operations relating to the // values merged. 
- void collectArrayAccessFrom(mlir::Value val) { + void collectArrayMentionFrom(mlir::Value val) { if (!val || visited.contains(val)) return; visited.insert(val); // Process a block argument. if (auto ba = val.dyn_cast()) { - collectArrayAccessFrom(ba); + collectArrayMentionFrom(ba); return; } // Process an Op. if (auto *op = val.getDefiningOp()) { - collectArrayAccessFrom(op, val); + collectArrayMentionFrom(op, val); return; } - fir::emitFatalError(val.getLoc(), "unhandled value"); + emitFatalError(val.getLoc(), "unhandled value"); + } + + /// Return all ops that produce the array value that is stored into the + /// `array_merge_store`. + static void reachingValues(llvm::SmallVectorImpl &reach, + mlir::Value seq) { + reach.clear(); + mlir::Region *loopRegion = nullptr; + if (auto doLoop = mlir::dyn_cast_or_null(seq.getDefiningOp())) + loopRegion = &doLoop->getRegion(0); + ReachCollector collector(reach, loopRegion); + collector.collectArrayMentionFrom(seq); } +private: /// Is \op inside the loop nest region ? + /// FIXME: replace this structural dependence with graph properties. bool opIsInsideLoops(mlir::Operation *op) const { - return loopRegion && loopRegion->isAncestor(op->getParentRegion()); + auto *region = op->getParentRegion(); + while (region) { + if (region == loopRegion) + return true; + region = region->getParentRegion(); + } + return false; } /// Recursively trace the use of an operation results, calling - /// collectArrayAccessFrom on the direct and indirect user operands. - /// TODO: Replace recursive algorithm on def-use chain with an iterative one - /// with an explicit stack. + /// collectArrayMentionFrom on the direct and indirect user operands. void followUsers(mlir::Operation *op) { for (auto userOperand : op->getOperands()) - collectArrayAccessFrom(userOperand); + collectArrayMentionFrom(userOperand); // Go through potential converts/coordinate_op. - for (mlir::Operation *indirectUser : op->getUsers()) + for (auto indirectUser : op->getUsers()) followUsers(indirectUser); } @@ -234,39 +319,30 @@ llvm::SmallPtrSet visited; /// Region of the loops nest that produced the array value. mlir::Region *loopRegion; - -public: - /// Return all ops that produce the array value that is stored into the - /// `array_merge_store`. - static void reachingValues(llvm::SmallVectorImpl &reach, - mlir::Value seq) { - reach.clear(); - mlir::Region *loopRegion = nullptr; - // Only `DoLoopOp` is tested here since array operations are currently only - // associated with this kind of loop. - if (auto doLoop = - mlir::dyn_cast_or_null(seq.getDefiningOp())) - loopRegion = &doLoop->getRegion(0); - ReachCollector collector(reach, loopRegion); - collector.collectArrayAccessFrom(seq); - } }; } // namespace /// Find all the array operations that access the array value that is loaded by /// the array load operation, `load`. 
-const llvm::SmallVector & -ArrayCopyAnalysis::arrayAccesses(ArrayLoadOp load) { +void ArrayCopyAnalysis::arrayMentions( + llvm::SmallVectorImpl &mentions, ArrayLoadOp load) { + mentions.clear(); auto lmIter = loadMapSets.find(load); - if (lmIter != loadMapSets.end()) - return lmIter->getSecond(); + if (lmIter != loadMapSets.end()) { + for (auto *opnd : lmIter->second) { + auto *owner = opnd->getOwner(); + if (mlir::isa(owner)) + mentions.push_back(owner); + } + return; + } - llvm::SmallVector accesses; UseSetT visited; llvm::SmallVector queue; // uses of ArrayLoad[orig] auto appendToQueue = [&](mlir::Value val) { - for (mlir::OpOperand &use : val.getUses()) + for (auto &use : val.getUses()) if (!visited.count(&use)) { visited.insert(&use); queue.push_back(&use); @@ -281,7 +357,8 @@ while (!queue.empty()) { mlir::OpOperand *operand = queue.pop_back_val(); mlir::Operation *owner = operand->getOwner(); - + if (!owner) + continue; auto structuredLoop = [&](auto ro) { if (auto blockArg = ro.iterArgToBlockArg(operand->get())) { int64_t arg = blockArg.getArgNumber(); @@ -313,26 +390,32 @@ structuredLoop(ro); } else if (auto rs = mlir::dyn_cast(owner)) { // Thread any uses of fir.if that return the marked array value. - if (auto ifOp = rs->getParentOfType()) + mlir::Operation *parent = rs->getParentRegion()->getParentOp(); + if (auto ifOp = mlir::dyn_cast(parent)) appendToQueue(ifOp.getResult(operand->getOperandNumber())); } else if (mlir::isa(owner)) { // Keep track of array value fetches. LLVM_DEBUG(llvm::dbgs() << "add fetch {" << *owner << "} to array value set\n"); - accesses.push_back(owner); + mentions.push_back(owner); } else if (auto update = mlir::dyn_cast(owner)) { // Keep track of array value updates and thread the return value uses. LLVM_DEBUG(llvm::dbgs() << "add update {" << *owner << "} to array value set\n"); - accesses.push_back(owner); + mentions.push_back(owner); appendToQueue(update.getResult()); } else if (auto update = mlir::dyn_cast(owner)) { // Keep track of array value modification and thread the return value // uses. LLVM_DEBUG(llvm::dbgs() << "add modify {" << *owner << "} to array value set\n"); - accesses.push_back(owner); + mentions.push_back(owner); appendToQueue(update.getResult(1)); + } else if (auto mention = mlir::dyn_cast(owner)) { + mentions.push_back(owner); + } else if (auto amend = mlir::dyn_cast(owner)) { + mentions.push_back(owner); + appendToQueue(amend.getResult()); } else if (auto br = mlir::dyn_cast(owner)) { branchOp(br.getDest(), br.getDestOperands()); } else if (auto br = mlir::dyn_cast(owner)) { @@ -344,7 +427,107 @@ llvm::report_fatal_error("array value reached unexpected op"); } } - return loadMapSets.insert({load, accesses}).first->getSecond(); + loadMapSets.insert({load, visited}); +} + +static bool hasPointerType(mlir::Type type) { + if (auto boxTy = type.dyn_cast()) + type = boxTy.getEleTy(); + return type.isa(); +} + +// This is a NF performance hack. It makes a simple test that the slices of the +// load, \p ld, and the merge store, \p st, are trivially mutually exclusive. +static bool mutuallyExclusiveSliceRange(ArrayLoadOp ld, ArrayMergeStoreOp st) { + // If the same array_load, then no further testing is warranted. 
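+  // For example, the ranges of `a(1:4)` and `a(5:8)` cannot overlap: one
+  // bound is the other displaced by a positive constant.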
+  if (ld.getResult() == st.getOriginal())
+    return false;
+
+  auto getSliceOp = [](mlir::Value val) -> SliceOp {
+    if (!val)
+      return {};
+    auto sliceOp = mlir::dyn_cast_or_null<SliceOp>(val.getDefiningOp());
+    if (!sliceOp)
+      return {};
+    return sliceOp;
+  };
+
+  auto ldSlice = getSliceOp(ld.getSlice());
+  auto stSlice = getSliceOp(st.getSlice());
+  if (!ldSlice || !stSlice)
+    return false;
+
+  // Give up on subobject slices.
+  if (!ldSlice.getFields().empty() || !stSlice.getFields().empty() ||
+      !ldSlice.getSubstr().empty() || !stSlice.getSubstr().empty())
+    return false;
+
+  // Crudely test that the two slices do not overlap by looking for the
+  // following general condition. If the slices look like (i:j) and (j+1:k),
+  // then these ranges do not overlap. The addend must be a constant.
+  auto ldTriples = ldSlice.getTriples();
+  auto stTriples = stSlice.getTriples();
+  const auto size = ldTriples.size();
+  if (size != stTriples.size())
+    return false;
+
+  auto displacedByConstant = [](mlir::Value v1, mlir::Value v2) {
+    auto removeConvert = [](mlir::Value v) -> mlir::Operation * {
+      auto *op = v.getDefiningOp();
+      while (auto conv = mlir::dyn_cast_or_null<ConvertOp>(op))
+        op = conv.getValue().getDefiningOp();
+      return op;
+    };
+
+    auto isPositiveConstant = [](mlir::Value v) -> bool {
+      if (auto conOp =
+              mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
+        if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+          return iattr.getInt() > 0;
+      return false;
+    };
+
+    auto *op1 = removeConvert(v1);
+    auto *op2 = removeConvert(v2);
+    if (!op1 || !op2)
+      return false;
+    if (auto addi = mlir::dyn_cast<mlir::arith::AddIOp>(op2))
+      if ((addi.getLhs().getDefiningOp() == op1 &&
+           isPositiveConstant(addi.getRhs())) ||
+          (addi.getRhs().getDefiningOp() == op1 &&
+           isPositiveConstant(addi.getLhs())))
+        return true;
+    if (auto subi = mlir::dyn_cast<mlir::arith::SubIOp>(op1))
+      if (subi.getLhs().getDefiningOp() == op2 &&
+          isPositiveConstant(subi.getRhs()))
+        return true;
+    return false;
+  };
+
+  for (std::remove_const_t<decltype(size)> i = 0; i < size; i += 3) {
+    // If both are loop invariant, skip to the next triple.
+    if (mlir::isa_and_nonnull<fir::UndefOp>(ldTriples[i + 1].getDefiningOp()) &&
+        mlir::isa_and_nonnull<fir::UndefOp>(stTriples[i + 1].getDefiningOp())) {
+      // Unless either is a vector index, then be conservative.
+      if (mlir::isa_and_nonnull<fir::UndefOp>(ldTriples[i].getDefiningOp()) ||
+          mlir::isa_and_nonnull<fir::UndefOp>(stTriples[i].getDefiningOp()))
+        return false;
+      continue;
+    }
+    // If identical, skip to the next triple.
+    if (ldTriples[i] == stTriples[i] && ldTriples[i + 1] == stTriples[i + 1] &&
+        ldTriples[i + 2] == stTriples[i + 2])
+      continue;
+    // If ubound and lbound are the same with a constant offset, skip to the
+    // next triple.
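+    // For example, with a load slice of (i:j) and a store slice of (j+1:k),
+    // the store lower bound is the load upper bound displaced by 1.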
+    if (displacedByConstant(ldTriples[i + 1], stTriples[i]) ||
+        displacedByConstant(stTriples[i + 1], ldTriples[i]))
+      continue;
+    return false;
+  }
+  LLVM_DEBUG(llvm::dbgs() << "detected non-overlapping slice ranges on " << ld
+                          << " and " << st << ", which is not a conflict\n");
+  return true;
 }
 
 /// Is there a conflict between the array value that was updated and to be
@@ -354,67 +537,132 @@
                            ArrayMergeStoreOp st) {
   mlir::Value load;
   mlir::Value addr = st.getMemref();
-  auto stEleTy = fir::dyn_cast_ptrOrBoxEleTy(addr.getType());
-  for (auto *op : reach) {
-    auto ld = mlir::dyn_cast<ArrayLoadOp>(op);
-    if (!ld)
-      continue;
-    mlir::Type ldTy = ld.getMemref().getType();
-    if (auto boxTy = ldTy.dyn_cast<fir::BoxType>())
-      ldTy = boxTy.getEleTy();
-    if (ldTy.isa<fir::PointerType>() && stEleTy == dyn_cast_ptrEleTy(ldTy))
-      return true;
-    if (ld.getMemref() == addr) {
-      if (ld.getResult() != st.getOriginal())
-        return true;
-      if (load)
+  const bool storeHasPointerType = hasPointerType(addr.getType());
+  for (auto *op : reach)
+    if (auto ld = mlir::dyn_cast<ArrayLoadOp>(op)) {
+      mlir::Type ldTy = ld.getMemref().getType();
+      if (ld.getMemref() == addr) {
+        if (mutuallyExclusiveSliceRange(ld, st))
+          continue;
+        if (ld.getResult() != st.getOriginal())
+          return true;
+        if (load) {
+          // TODO: extend this to allow checking if the first `load` and this
+          // `ld` are mutually exclusive accesses but not identical.
+          return true;
+        }
+        load = ld;
+      } else if ((hasPointerType(ldTy) || storeHasPointerType)) {
+        // TODO: Use target attribute to restrict this case further.
+        // TODO: Check if types can also allow ruling out some cases. For now,
+        // the fact that equivalences are implemented with the pointer
+        // attribute to enforce aliasing prevents any attempt to do so. In
+        // general, it may also be wrong to rely on the types if any of them
+        // is a complex or a derived type, for which it is possible to create
+        // a pointer to a part with a different type than the whole, although
+        // this deserves more investigation because existing compiler behavior
+        // seems to diverge here.
         return true;
-      load = ld;
+      }
     }
-  }
   return false;
 }
 
-/// Check if there is any potential conflict in the chained update operations
-/// (ArrayFetchOp, ArrayUpdateOp, ArrayModifyOp) while merging back to the
-/// array. A potential conflict is detected if two operations work on the same
-/// indices.
-static bool conflictOnMerge(llvm::ArrayRef<mlir::Operation *> accesses) {
-  if (accesses.size() < 2)
+/// Is there an access vector conflict on the array being merged into? If the
+/// access vectors diverge, then assume that there are potentially overlapping
+/// loop-carried references.
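+/// For example, a fetch at index (i+1) merged with an update at index (i)
+/// diverges and must be treated as a loop-carried dependence.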
+static bool conflictOnMerge(llvm::ArrayRef<mlir::Operation *> mentions) {
+  if (mentions.size() < 2)
     return false;
   llvm::SmallVector<mlir::Value> indices;
-  LLVM_DEBUG(llvm::dbgs() << "check merge conflict on with " << accesses.size()
-                          << " accesses on the list\n");
-  for (auto *op : accesses) {
-    assert((mlir::isa<ArrayFetchOp, ArrayUpdateOp, ArrayModifyOp>(op)) &&
-           "unexpected operation in analysis");
+  LLVM_DEBUG(llvm::dbgs() << "check merge conflict with " << mentions.size()
+                          << " mentions on the list\n");
+  bool valSeen = false;
+  bool refSeen = false;
+  for (auto *op : mentions) {
     llvm::SmallVector<mlir::Value> compareVector;
     if (auto u = mlir::dyn_cast<ArrayUpdateOp>(op)) {
+      valSeen = true;
       if (indices.empty()) {
         indices = u.getIndices();
         continue;
       }
       compareVector = u.getIndices();
     } else if (auto f = mlir::dyn_cast<ArrayModifyOp>(op)) {
+      valSeen = true;
       if (indices.empty()) {
         indices = f.getIndices();
         continue;
       }
       compareVector = f.getIndices();
     } else if (auto f = mlir::dyn_cast<ArrayFetchOp>(op)) {
+      valSeen = true;
+      if (indices.empty()) {
+        indices = f.getIndices();
+        continue;
+      }
+      compareVector = f.getIndices();
+    } else if (auto f = mlir::dyn_cast<ArrayAccessOp>(op)) {
+      refSeen = true;
       if (indices.empty()) {
         indices = f.getIndices();
         continue;
       }
       compareVector = f.getIndices();
+    } else if (mlir::isa<ArrayAmendOp>(op)) {
+      refSeen = true;
+      continue;
+    } else {
+      mlir::emitError(op->getLoc(), "unexpected operation in analysis");
     }
-    if (compareVector != indices)
+    if (compareVector.size() != indices.size() ||
+        llvm::any_of(llvm::zip(compareVector, indices), [&](auto pair) {
+          return std::get<0>(pair) != std::get<1>(pair);
+        }))
       return true;
     LLVM_DEBUG(llvm::dbgs() << "vectors compare equal\n");
   }
+  return valSeen && refSeen;
+}
+
+/// With element-by-reference semantics, an amended array with more than one
+/// access to the same loaded array is conservatively considered a conflict.
+/// Note: the array copy can still be eliminated in subsequent optimizations.
+static bool conflictOnReference(llvm::ArrayRef<mlir::Operation *> mentions) {
+  LLVM_DEBUG(llvm::dbgs() << "checking reference semantics " << mentions.size()
+                          << '\n');
+  if (mentions.size() < 3)
+    return false;
+  unsigned amendCount = 0;
+  unsigned accessCount = 0;
+  for (auto *op : mentions) {
+    if (mlir::isa<ArrayAmendOp>(op) && ++amendCount > 1) {
+      LLVM_DEBUG(llvm::dbgs() << "conflict: multiple amends of array value\n");
+      return true;
+    }
+    if (mlir::isa<ArrayAccessOp>(op) && ++accessCount > 1) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "conflict: multiple accesses of array value\n");
+      return true;
+    }
+    if (mlir::isa<ArrayFetchOp, ArrayUpdateOp, ArrayModifyOp>(op)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "conflict: array value has both uses by-value and uses "
+                    "by-reference. conservative assumption.\n");
+      return true;
+    }
+  }
   return false;
 }
 
+static mlir::Operation *
+amendingAccess(llvm::ArrayRef<mlir::Operation *> mentions) {
+  for (auto *op : mentions)
+    if (auto amend = mlir::dyn_cast<ArrayAmendOp>(op))
+      return amend.getMemref().getDefiningOp();
+  return {};
+}
+
 // Are either of types of conflicts present?
 inline bool conflictDetected(llvm::ArrayRef<mlir::Operation *> reach,
                              llvm::ArrayRef<mlir::Operation *> accesses,
@@ -422,49 +670,78 @@
   return conflictOnLoad(reach, st) || conflictOnMerge(accesses);
 }
 
+// Assume that any call to a function that uses host-associations will be
+// modifying the output array.
+static bool
+conservativeCallConflict(llvm::ArrayRef<mlir::Operation *> reaches) {
+  return llvm::any_of(reaches, [](mlir::Operation *op) {
+    if (auto call = mlir::dyn_cast<fir::CallOp>(op))
+      if (auto callee =
+              call.getCallableForCallee().dyn_cast<mlir::SymbolRefAttr>()) {
+        auto module = op->getParentOfType<mlir::ModuleOp>();
+        return hasHostAssociationArgument(
+            module.lookupSymbol<mlir::FuncOp>(callee));
+      }
+    return false;
+  });
+}
+
 /// Constructor of the array copy analysis.
/// This performs the analysis and saves the intermediate results. void ArrayCopyAnalysis::construct(mlir::Operation *topLevelOp) { topLevelOp->walk([&](Operation *op) { if (auto st = mlir::dyn_cast(op)) { - llvm::SmallVector values; + llvm::SmallVector values; ReachCollector::reachingValues(values, st.getSequence()); - const llvm::SmallVector &accesses = arrayAccesses( - mlir::cast(st.getOriginal().getDefiningOp())); - if (conflictDetected(values, accesses, st)) { + bool callConflict = conservativeCallConflict(values); + llvm::SmallVector mentions; + arrayMentions(mentions, + mlir::cast(st.getOriginal().getDefiningOp())); + bool conflict = conflictDetected(values, mentions, st); + bool refConflict = conflictOnReference(mentions); + if (callConflict || conflict || refConflict) { LLVM_DEBUG(llvm::dbgs() << "CONFLICT: copies required for " << st << '\n' << " adding conflicts on: " << op << " and " << st.getOriginal() << '\n'); conflicts.insert(op); conflicts.insert(st.getOriginal().getDefiningOp()); + if (auto *access = amendingAccess(mentions)) + amendAccesses.insert(access); } auto *ld = st.getOriginal().getDefiningOp(); LLVM_DEBUG(llvm::dbgs() << "map: adding {" << *ld << " -> " << st << "}\n"); useMap.insert({ld, op}); } else if (auto load = mlir::dyn_cast(op)) { - const llvm::SmallVector &accesses = - arrayAccesses(load); + llvm::SmallVector mentions; + arrayMentions(mentions, load); LLVM_DEBUG(llvm::dbgs() << "process load: " << load - << ", accesses: " << accesses.size() << '\n'); - for (auto *acc : accesses) { - LLVM_DEBUG(llvm::dbgs() << " access: " << *acc << '\n'); - assert((mlir::isa(acc))); - if (!useMap.insert({acc, op}).second) { - mlir::emitError( - load.getLoc(), - "The parallel semantics of multiple array_merge_stores per " - "array_load are not supported."); - return; + << ", mentions: " << mentions.size() << '\n'); + for (auto *acc : mentions) { + LLVM_DEBUG(llvm::dbgs() << " mention: " << *acc << '\n'); + if (mlir::isa(acc)) { + if (useMap.count(acc)) { + mlir::emitError( + load.getLoc(), + "The parallel semantics of multiple array_merge_stores per " + "array_load are not supported."); + continue; + } + LLVM_DEBUG(llvm::dbgs() + << "map: adding {" << *acc << "} -> {" << load << "}\n"); + useMap.insert({acc, op}); } - LLVM_DEBUG(llvm::dbgs() - << "map: adding {" << *acc << "} -> {" << load << "}\n"); } } }); } +//===----------------------------------------------------------------------===// +// Conversions for converting out of array value form. +//===----------------------------------------------------------------------===// + namespace { class ArrayLoadConversion : public mlir::OpRewritePattern { public: @@ -495,60 +772,93 @@ } // namespace static mlir::Type getEleTy(mlir::Type ty) { - if (auto t = dyn_cast_ptrEleTy(ty)) - ty = t; - if (auto t = ty.dyn_cast()) - ty = t.getEleTy(); + auto eleTy = unwrapSequenceType(unwrapPassByRefType(ty)); // FIXME: keep ptr/heap/ref information. - return ReferenceType::get(ty); + return ReferenceType::get(eleTy); } // Extract extents from the ShapeOp/ShapeShiftOp into the result vector. -// TODO: getExtents on op should return a ValueRange instead of a vector. 
-static void getExtents(llvm::SmallVectorImpl<mlir::Value> &result,
-                       mlir::Value shape) {
+static bool getAdjustedExtents(mlir::Location loc,
+                               mlir::PatternRewriter &rewriter,
+                               ArrayLoadOp arrLoad,
+                               llvm::SmallVectorImpl<mlir::Value> &result,
+                               mlir::Value shape) {
+  bool copyUsingSlice = false;
   auto *shapeOp = shape.getDefiningOp();
-  if (auto s = mlir::dyn_cast<fir::ShapeOp>(shapeOp)) {
+  if (auto s = mlir::dyn_cast_or_null<fir::ShapeOp>(shapeOp)) {
     auto e = s.getExtents();
     result.insert(result.end(), e.begin(), e.end());
-    return;
-  }
-  if (auto s = mlir::dyn_cast<fir::ShapeShiftOp>(shapeOp)) {
+  } else if (auto s = mlir::dyn_cast_or_null<fir::ShapeShiftOp>(shapeOp)) {
     auto e = s.getExtents();
     result.insert(result.end(), e.begin(), e.end());
-    return;
+  } else {
+    emitFatalError(loc, "not a fir.shape/fir.shape_shift op");
+  }
+  auto idxTy = rewriter.getIndexType();
+  if (factory::isAssumedSize(result)) {
+    // Use slice information to compute the extent of the column.
+    auto one = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 1);
+    mlir::Value size = one;
+    if (mlir::Value sliceArg = arrLoad.getSlice()) {
+      if (auto sliceOp =
+              mlir::dyn_cast_or_null<SliceOp>(sliceArg.getDefiningOp())) {
+        auto triples = sliceOp.getTriples();
+        const std::size_t tripleSize = triples.size();
+        auto module = arrLoad->getParentOfType<mlir::ModuleOp>();
+        FirOpBuilder builder(rewriter, getKindMapping(module));
+        size = builder.genExtentFromTriplet(loc, triples[tripleSize - 3],
+                                            triples[tripleSize - 2],
+                                            triples[tripleSize - 1], idxTy);
+        copyUsingSlice = true;
+      }
+    }
+    result[result.size() - 1] = size;
   }
-  llvm::report_fatal_error("not a fir.shape/fir.shape_shift op");
+  return copyUsingSlice;
 }
 
-// Place the extents of the array loaded by an ArrayLoadOp into the result
-// vector and return a ShapeOp/ShapeShiftOp with the corresponding extents. If
-// the ArrayLoadOp is loading a fir.box, code will be generated to read the
-// extents from the fir.box, and a the retunred ShapeOp is built with the read
-// extents.
-// Otherwise, the extents will be extracted from the ShapeOp/ShapeShiftOp
-// argument of the ArrayLoadOp that is returned.
-static mlir::Value
-getOrReadExtentsAndShapeOp(mlir::Location loc, mlir::PatternRewriter &rewriter,
-                           fir::ArrayLoadOp loadOp,
-                           llvm::SmallVectorImpl<mlir::Value> &result) {
+/// Place the extents of the array load, \p arrLoad, into \p result and
+/// return a ShapeOp or ShapeShiftOp with the same extents. If \p arrLoad is
+/// loading a `!fir.box`, code will be generated to read the extents from the
+/// boxed value, and the returned shape Op will be built with the extents read
+/// from the box. Otherwise, the extents will be extracted from the ShapeOp (or
+/// ShapeShiftOp) argument of \p arrLoad. \p copyUsingSlice will be set to true
+/// if slicing of the output array is to be done in the copy-in/copy-out rather
+/// than in the elemental computation step.
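+/// For an assumed-size array, the unknown extent of the last dimension is
+/// recomputed from the last slice triple via genExtentFromTriplet.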
+static mlir::Value getOrReadExtentsAndShapeOp(
+    mlir::Location loc, mlir::PatternRewriter &rewriter, ArrayLoadOp arrLoad,
+    llvm::SmallVectorImpl<mlir::Value> &result, bool &copyUsingSlice) {
   assert(result.empty());
-  if (auto boxTy = loadOp.getMemref().getType().dyn_cast<fir::BoxType>()) {
-    auto rank = fir::dyn_cast_ptrOrBoxEleTy(boxTy)
-                    .cast<fir::SequenceType>()
-                    .getDimension();
+  if (arrLoad->hasAttr(fir::getOptionalAttrName()))
+    fir::emitFatalError(
+        loc, "shapes from array load of OPTIONAL arrays must not be used");
+  if (auto boxTy = arrLoad.getMemref().getType().dyn_cast<BoxType>()) {
+    auto rank =
+        dyn_cast_ptrOrBoxEleTy(boxTy).cast<SequenceType>().getDimension();
     auto idxTy = rewriter.getIndexType();
     for (decltype(rank) dim = 0; dim < rank; ++dim) {
-      auto dimVal = rewriter.create<mlir::arith::ConstantIndexOp>(loc, dim);
-      auto dimInfo = rewriter.create<BoxDimsOp>(
-          loc, idxTy, idxTy, idxTy, loadOp.getMemref(), dimVal);
+      auto dimVal = rewriter.create<mlir::arith::ConstantIndexOp>(loc, dim);
+      auto dimInfo = rewriter.create<BoxDimsOp>(loc, idxTy, idxTy, idxTy,
                                                arrLoad.getMemref(), dimVal);
       result.emplace_back(dimInfo.getResult(1));
     }
-    auto shapeType = fir::ShapeType::get(rewriter.getContext(), rank);
-    return rewriter.create<ShapeOp>(loc, shapeType, result);
+    if (!arrLoad.getShape()) {
+      auto shapeType = ShapeType::get(rewriter.getContext(), rank);
+      return rewriter.create<ShapeOp>(loc, shapeType, result);
+    }
+    auto shiftOp = arrLoad.getShape().getDefiningOp<ShiftOp>();
+    auto shapeShiftType = ShapeShiftType::get(rewriter.getContext(), rank);
+    llvm::SmallVector<mlir::Value> shapeShiftOperands;
+    for (auto [lb, extent] : llvm::zip(shiftOp.getOrigins(), result)) {
+      shapeShiftOperands.push_back(lb);
+      shapeShiftOperands.push_back(extent);
+    }
+    return rewriter.create<ShapeShiftOp>(loc, shapeShiftType,
+                                         shapeShiftOperands);
   }
-  getExtents(result, loadOp.getShape());
-  return loadOp.getShape();
+  copyUsingSlice =
+      getAdjustedExtents(loc, rewriter, arrLoad, result, arrLoad.getShape());
+  return arrLoad.getShape();
 }
 
 static mlir::Type toRefType(mlir::Type ty) {
@@ -582,6 +892,121 @@
   return result;
 }
 
+static mlir::Value getCharacterLen(mlir::Location loc, FirOpBuilder &builder,
+                                   ArrayLoadOp load, CharacterType charTy) {
+  auto charLenTy = builder.getCharacterLengthType();
+  if (charTy.hasDynamicLen()) {
+    if (load.getMemref().getType().isa<BoxType>()) {
+      // The loaded array is an emboxed value. Get the CHARACTER length from
+      // the box value.
+      auto eleSzInBytes =
+          builder.create<BoxEleSizeOp>(loc, charLenTy, load.getMemref());
+      auto kindSize =
+          builder.getKindMap().getCharacterBitsize(charTy.getFKind());
+      auto kindByteSize =
+          builder.createIntegerConstant(loc, charLenTy, kindSize / 8);
+      return builder.create<mlir::arith::DivSIOp>(loc, eleSzInBytes,
+                                                  kindByteSize);
+    }
+    // The loaded array is a (set of) unboxed values. If the CHARACTER's
+    // length is not a constant, it must be provided as a type parameter to
+    // the array_load.
+    auto typeparams = load.getTypeparams();
+    assert(typeparams.size() > 0 && "expected type parameters on array_load");
+    return typeparams.back();
+  }
+  // The typical case: the length of the CHARACTER is a compile-time
+  // constant that is encoded in the type information.
+  return builder.createIntegerConstant(loc, charLenTy, charTy.getLen());
+}
+
+/// Generate a shallow array copy. This is used for both copy-in and copy-out.
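+/// A loop nest is built over the adjusted extents and the elements are
+/// assigned one at a time with genScalarAssignment.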
+template +void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter, + mlir::Value dst, mlir::Value src, mlir::Value shapeOp, + mlir::Value sliceOp, ArrayLoadOp arrLoad) { + auto insPt = rewriter.saveInsertionPoint(); + llvm::SmallVector indices; + llvm::SmallVector extents; + bool copyUsingSlice = + getAdjustedExtents(loc, rewriter, arrLoad, extents, shapeOp); + auto idxTy = rewriter.getIndexType(); + // Build loop nest from column to row. + for (auto sh : llvm::reverse(extents)) { + auto ubi = rewriter.create(loc, idxTy, sh); + auto zero = rewriter.create(loc, 0); + auto one = rewriter.create(loc, 1); + auto ub = rewriter.create(loc, idxTy, ubi, one); + auto loop = rewriter.create(loc, zero, ub, one); + rewriter.setInsertionPointToStart(loop.getBody()); + indices.push_back(loop.getInductionVar()); + } + // Reverse the indices so they are in column-major order. + std::reverse(indices.begin(), indices.end()); + auto typeparams = arrLoad.getTypeparams(); + auto fromAddr = rewriter.create( + loc, getEleTy(src.getType()), src, shapeOp, + CopyIn && copyUsingSlice ? sliceOp : mlir::Value{}, + factory::originateIndices(loc, rewriter, src.getType(), shapeOp, indices), + typeparams); + auto toAddr = rewriter.create( + loc, getEleTy(dst.getType()), dst, shapeOp, + !CopyIn && copyUsingSlice ? sliceOp : mlir::Value{}, + factory::originateIndices(loc, rewriter, dst.getType(), shapeOp, indices), + typeparams); + auto eleTy = unwrapSequenceType(unwrapPassByRefType(dst.getType())); + auto module = toAddr->getParentOfType(); + FirOpBuilder builder(rewriter, getKindMapping(module)); + // Copy from (to) object to (from) temp copy of same object. + if (auto charTy = eleTy.dyn_cast()) { + auto len = getCharacterLen(loc, builder, arrLoad, charTy); + CharBoxValue toChar(toAddr, len); + CharBoxValue fromChar(fromAddr, len); + factory::genScalarAssignment(builder, loc, toChar, fromChar); + } else { + if (hasDynamicSize(eleTy)) + TODO(loc, "copy element of dynamic size"); + factory::genScalarAssignment(builder, loc, toAddr, fromAddr); + } + rewriter.restoreInsertionPoint(insPt); +} + +/// The array load may be either a boxed or unboxed value. If the value is +/// boxed, we read the type parameters from the boxed value. +static llvm::SmallVector +genArrayLoadTypeParameters(mlir::Location loc, mlir::PatternRewriter &rewriter, + ArrayLoadOp load) { + if (load.getTypeparams().empty()) { + auto eleTy = + unwrapSequenceType(unwrapPassByRefType(load.getMemref().getType())); + if (hasDynamicSize(eleTy)) { + if (auto charTy = eleTy.dyn_cast()) { + assert(load.getMemref().getType().isa()); + auto module = load->getParentOfType(); + FirOpBuilder builder(rewriter, getKindMapping(module)); + return {getCharacterLen(loc, builder, load, charTy)}; + } + TODO(loc, "unhandled dynamic type parameters"); + } + return {}; + } + return load.getTypeparams(); +} + +static llvm::SmallVector +findNonconstantExtents(mlir::Type memrefTy, + llvm::ArrayRef extents) { + llvm::SmallVector nce; + auto arrTy = unwrapPassByRefType(memrefTy); + auto seqTy = arrTy.cast(); + for (auto [s, x] : llvm::zip(seqTy.getShape(), extents)) + if (s == SequenceType::getUnknownExtent()) + nce.emplace_back(x); + if (extents.size() > seqTy.getShape().size()) + for (auto x : extents.drop_front(seqTy.getShape().size())) + nce.emplace_back(x); + return nce; +} + namespace { /// Conversion of fir.array_update and fir.array_modify Ops. 
 /// If there is a conflict for the update, then we need to perform a
@@ -590,60 +1015,68 @@
 template <typename ArrayOp>
 class ArrayUpdateConversionBase : public mlir::OpRewritePattern<ArrayOp> {
 public:
+  // TODO: Implement copy/swap semantics?
   explicit ArrayUpdateConversionBase(mlir::MLIRContext *ctx,
                                      const ArrayCopyAnalysis &a,
                                      const OperationUseMapT &m)
       : mlir::OpRewritePattern<ArrayOp>{ctx}, analysis{a}, useMap{m} {}
 
-  void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter,
-                    mlir::Value dst, mlir::Value src, mlir::Value shapeOp,
-                    mlir::Type arrTy) const {
-    auto insPt = rewriter.saveInsertionPoint();
-    llvm::SmallVector<mlir::Value> indices;
+  /// The array_access, \p access, must be made against a cloned copy due to a
+  /// potential conflict. Uses copy-in/copy-out semantics and not copy/swap.
+  mlir::Value referenceToClone(mlir::Location loc,
+                               mlir::PatternRewriter &rewriter,
+                               ArrayOp access) const {
+    LLVM_DEBUG(llvm::dbgs()
+               << "generating copy-in/copy-out loops for " << access << '\n');
+    auto *op = access.getOperation();
+    auto *loadOp = useMap.lookup(op);
+    auto load = mlir::cast<ArrayLoadOp>(loadOp);
+    auto eleTy = access.getType();
+    rewriter.setInsertionPoint(loadOp);
+    // Copy in.
     llvm::SmallVector<mlir::Value> extents;
-    getExtents(extents, shapeOp);
-    // Build loop nest from column to row.
-    for (auto sh : llvm::reverse(extents)) {
-      auto idxTy = rewriter.getIndexType();
-      auto ubi = rewriter.create<ConvertOp>(loc, idxTy, sh);
-      auto zero = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 0);
-      auto one = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 1);
-      auto ub = rewriter.create<mlir::arith::SubIOp>(loc, idxTy, ubi, one);
-      auto loop = rewriter.create<DoLoopOp>(loc, zero, ub, one);
-      rewriter.setInsertionPointToStart(loop.getBody());
-      indices.push_back(loop.getInductionVar());
-    }
-    // Reverse the indices so they are in column-major order.
-    std::reverse(indices.begin(), indices.end());
-    auto ty = getEleTy(arrTy);
-    auto fromAddr = rewriter.create<ArrayCoorOp>(
-        loc, ty, src, shapeOp, mlir::Value{},
-        fir::factory::originateIndices(loc, rewriter, src.getType(), shapeOp,
-                                       indices),
-        mlir::ValueRange{});
-    auto load = rewriter.create<fir::LoadOp>(loc, fromAddr);
-    auto toAddr = rewriter.create<ArrayCoorOp>(
-        loc, ty, dst, shapeOp, mlir::Value{},
-        fir::factory::originateIndices(loc, rewriter, dst.getType(), shapeOp,
-                                       indices),
-        mlir::ValueRange{});
-    rewriter.create<fir::StoreOp>(loc, load, toAddr);
-    rewriter.restoreInsertionPoint(insPt);
+    bool copyUsingSlice = false;
+    auto shapeOp = getOrReadExtentsAndShapeOp(loc, rewriter, load, extents,
+                                              copyUsingSlice);
+    llvm::SmallVector<mlir::Value> nonconstantExtents =
+        findNonconstantExtents(load.getMemref().getType(), extents);
+    auto allocmem = rewriter.create<AllocMemOp>(
+        loc, dyn_cast_ptrOrBoxEleTy(load.getMemref().getType()),
+        genArrayLoadTypeParameters(loc, rewriter, load), nonconstantExtents);
+    genArrayCopy</*CopyIn=*/true>(load.getLoc(), rewriter, allocmem,
+                                  load.getMemref(), shapeOp, load.getSlice(),
+                                  load);
+    // Generate the reference for the access.
+    rewriter.setInsertionPoint(op);
+    auto coor =
+        genCoorOp(rewriter, loc, getEleTy(load.getType()), eleTy, allocmem,
+                  shapeOp, copyUsingSlice ? mlir::Value{} : load.getSlice(),
+                  access.getIndices(), load.getTypeparams(),
+                  access->hasAttr(factory::attrFortranArrayOffsets()));
+    // Copy out.
+    auto *storeOp = useMap.lookup(loadOp);
+    auto store = mlir::cast<ArrayMergeStoreOp>(storeOp);
+    rewriter.setInsertionPoint(storeOp);
+ genArrayCopy(store.getLoc(), rewriter, store.getMemref(), + allocmem, shapeOp, store.getSlice(), load); + rewriter.create(loc, allocmem); + return coor; } /// Copy the RHS element into the LHS and insert copy-in/copy-out between a /// temp and the LHS if the analysis found potential overlaps between the RHS - /// and LHS arrays. The element copy generator must be provided through \p + /// and LHS arrays. The element copy generator must be provided in \p /// assignElement. \p update must be the ArrayUpdateOp or the ArrayModifyOp. /// Returns the address of the LHS element inside the loop and the LHS /// ArrayLoad result. std::pair materializeAssignment(mlir::Location loc, mlir::PatternRewriter &rewriter, ArrayOp update, - llvm::function_ref assignElement, + const std::function &assignElement, mlir::Type lhsEltRefType) const { auto *op = update.getOperation(); - mlir::Operation *loadOp = useMap.lookup(op); + auto *loadOp = useMap.lookup(op); auto load = mlir::cast(loadOp); LLVM_DEBUG(llvm::outs() << "does " << load << " have a conflict?\n"); if (analysis.hasPotentialConflict(loadOp)) { @@ -654,25 +1087,31 @@ rewriter.setInsertionPoint(loadOp); // Copy in. llvm::SmallVector extents; - mlir::Value shapeOp = - getOrReadExtentsAndShapeOp(loc, rewriter, load, extents); + bool copyUsingSlice = false; + auto shapeOp = getOrReadExtentsAndShapeOp(loc, rewriter, load, extents, + copyUsingSlice); + llvm::SmallVector nonconstantExtents = + findNonconstantExtents(load.getMemref().getType(), extents); auto allocmem = rewriter.create( loc, dyn_cast_ptrOrBoxEleTy(load.getMemref().getType()), - load.getTypeparams(), extents); - genArrayCopy(load.getLoc(), rewriter, allocmem, load.getMemref(), shapeOp, - load.getType()); + genArrayLoadTypeParameters(loc, rewriter, load), nonconstantExtents); + genArrayCopy(load.getLoc(), rewriter, allocmem, + load.getMemref(), shapeOp, load.getSlice(), + load); rewriter.setInsertionPoint(op); - mlir::Value coor = genCoorOp( + auto coor = genCoorOp( rewriter, loc, getEleTy(load.getType()), lhsEltRefType, allocmem, - shapeOp, load.getSlice(), update.getIndices(), load.getTypeparams(), - update->hasAttr(fir::factory::attrFortranArrayOffsets())); + shapeOp, copyUsingSlice ? mlir::Value{} : load.getSlice(), + update.getIndices(), load.getTypeparams(), + update->hasAttr(factory::attrFortranArrayOffsets())); assignElement(coor); - mlir::Operation *storeOp = useMap.lookup(loadOp); + auto *storeOp = useMap.lookup(loadOp); auto store = mlir::cast(storeOp); rewriter.setInsertionPoint(storeOp); // Copy out. 
-      genArrayCopy(store.getLoc(), rewriter, store.getMemref(), allocmem,
-                   shapeOp, load.getType());
+      genArrayCopy</*CopyIn=*/false>(store.getLoc(), rewriter,
+                                     store.getMemref(), allocmem, shapeOp,
+                                     store.getSlice(), load);
       rewriter.create<FreeMemOp>(loc, allocmem);
       return {coor, load.getResult()};
     }
@@ -681,15 +1120,15 @@
     LLVM_DEBUG(llvm::outs() << "No, conflict wasn't found\n");
     rewriter.setInsertionPoint(op);
     auto coorTy = getEleTy(load.getType());
-    mlir::Value coor = genCoorOp(
-        rewriter, loc, coorTy, lhsEltRefType, load.getMemref(), load.getShape(),
-        load.getSlice(), update.getIndices(), load.getTypeparams(),
-        update->hasAttr(fir::factory::attrFortranArrayOffsets()));
+    auto coor = genCoorOp(rewriter, loc, coorTy, lhsEltRefType,
+                          load.getMemref(), load.getShape(), load.getSlice(),
+                          update.getIndices(), load.getTypeparams(),
+                          update->hasAttr(factory::attrFortranArrayOffsets()));
     assignElement(coor);
     return {coor, load.getResult()};
   }
 
-private:
+protected:
   const ArrayCopyAnalysis &analysis;
   const OperationUseMapT &useMap;
 };
@@ -706,7 +1145,12 @@
     mlir::PatternRewriter &rewriter) const override {
     auto loc = update.getLoc();
     auto assignElement = [&](mlir::Value coor) {
-      rewriter.create<fir::StoreOp>(loc, update.getMerge(), coor);
+      auto input = update.getMerge();
+      if (auto inEleTy = dyn_cast_ptrEleTy(input.getType())) {
+        emitFatalError(loc, "array_update on references not supported");
+      } else {
+        rewriter.create<fir::StoreOp>(loc, input, coor);
+      }
     };
     auto lhsEltRefType = toRefType(update.getMerge().getType());
     auto [_, lhsLoadResult] = materializeAssignment(
@@ -753,21 +1197,77 @@
     rewriter.setInsertionPoint(op);
     auto load = mlir::cast<ArrayLoadOp>(useMap.lookup(op));
     auto loc = fetch.getLoc();
-    mlir::Value coor =
+    auto coor =
         genCoorOp(rewriter, loc, getEleTy(load.getType()),
                   toRefType(fetch.getType()), load.getMemref(), load.getShape(),
                   load.getSlice(), fetch.getIndices(), load.getTypeparams(),
-                  fetch->hasAttr(fir::factory::attrFortranArrayOffsets()));
-    rewriter.replaceOpWithNewOp<fir::LoadOp>(fetch, coor);
+                  fetch->hasAttr(factory::attrFortranArrayOffsets()));
+    if (isa_ref_type(fetch.getType()))
+      rewriter.replaceOp(fetch, coor);
+    else
+      rewriter.replaceOpWithNewOp<fir::LoadOp>(fetch, coor);
     return mlir::success();
   }
 
 private:
   const OperationUseMapT &useMap;
 };
-} // namespace
 
-namespace {
+/// An array_access op is like an array_fetch op, except that it does not
+/// imply a load op. (It operates in the reference domain.)
+class ArrayAccessConversion : public ArrayUpdateConversionBase<ArrayAccessOp> {
+public:
+  explicit ArrayAccessConversion(mlir::MLIRContext *ctx,
+                                 const ArrayCopyAnalysis &a,
+                                 const OperationUseMapT &m)
+      : ArrayUpdateConversionBase{ctx, a, m} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(ArrayAccessOp access,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto *op = access.getOperation();
+    auto loc = access.getLoc();
+    if (analysis.inAmendAccessSet(op)) {
+      // This array_access is associated with an array_amend and there is a
+      // conflict. Make a copy to store into.
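+      // The clone is copied in from the original array before the array_load
+      // and copied back out at the array_merge_store (see referenceToClone).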
+ auto result = referenceToClone(loc, rewriter, access); + access.replaceAllUsesWith(result); + rewriter.replaceOp(access, result); + return mlir::success(); + } + rewriter.setInsertionPoint(op); + auto load = mlir::cast(useMap.lookup(op)); + auto coor = genCoorOp(rewriter, loc, getEleTy(load.getType()), + toRefType(access.getType()), load.getMemref(), + load.getShape(), load.getSlice(), access.getIndices(), + load.getTypeparams(), + access->hasAttr(factory::attrFortranArrayOffsets())); + rewriter.replaceOp(access, coor); + return mlir::success(); + } +}; + +/// An array_amend op is a marker to record which array access is being used to +/// update an array value. After this pass runs, an array_amend has no +/// semantics. We rewrite these to undefined values here to remove them while +/// preserving SSA form. +class ArrayAmendConversion : public mlir::OpRewritePattern { +public: + explicit ArrayAmendConversion(mlir::MLIRContext *ctx) + : OpRewritePattern{ctx} {} + + mlir::LogicalResult + matchAndRewrite(ArrayAmendOp amend, + mlir::PatternRewriter &rewriter) const override { + auto *op = amend.getOperation(); + rewriter.setInsertionPoint(op); + auto loc = amend.getLoc(); + auto undef = rewriter.create(loc, amend.getType()); + rewriter.replaceOp(amend, undef.getResult()); + return mlir::success(); + } +}; + class ArrayValueCopyConverter : public ArrayValueCopyBase { public: @@ -778,22 +1278,21 @@ auto *context = &getContext(); // Perform the conflict analysis. - auto &analysis = getAnalysis(); + const auto &analysis = getAnalysis(); const auto &useMap = analysis.getUseMap(); - // Phase 1 is performing a rewrite on the array accesses. Once all the - // array accesses are rewritten we can go on phase 2. - // Phase 2 gets rid of the useless copy-in/copyout operations. The copy-in - // /copy-out refers the Fortran copy-in/copy-out semantics on statements. mlir::RewritePatternSet patterns1(context); patterns1.insert(context, useMap); patterns1.insert(context, analysis, useMap); patterns1.insert(context, analysis, useMap); + patterns1.insert(context, analysis, useMap); + patterns1.insert(context); mlir::ConversionTarget target(*context); - target.addLegalDialect< - FIROpsDialect, mlir::scf::SCFDialect, mlir::arith::ArithmeticDialect, - mlir::cf::ControlFlowDialect, mlir::func::FuncDialect>(); - target.addIllegalOp(); + target.addLegalDialect(); + target.addIllegalOp(); // Rewrite the array fetch and array update ops. if (mlir::failed( mlir::applyPartialConversion(func, target, std::move(patterns1)))) { diff --git a/flang/test/Fir/array-value-copy.fir b/flang/test/Fir/array-value-copy.fir --- a/flang/test/Fir/array-value-copy.fir +++ b/flang/test/Fir/array-value-copy.fir @@ -104,12 +104,12 @@ // CHECK-LABEL: func @conversion_with_temporary( // CHECK-SAME: %[[ARR0:.*]]: !fir.ref>) // Allocation of temporary array. -// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32>, %{{.*}} +// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> // Copy of original array to temp. 
// CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // Perform the assignment i = i(10:1:-1) using the temporary array. @@ -125,8 +125,8 @@ // Copy the result back to the original array. // CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK: %[[COOR0:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0:.*]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0:.*]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // Free temporary array. @@ -176,7 +176,7 @@ // CHECK-SAME: %[[ARR0:.*]]: !fir.ref>) { // CHECK: %[[CST10:.*]] = arith.constant 10 : index // CHECK: %[[CST5:.*]] = arith.constant 5 : index -// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10x5xi32>, %c10, %c5 +// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10x5xi32> // CHECK: %[[IDX5:.*]] = fir.convert %[[CST5]] : (index) -> index // CHECK: %[[UB5:.*]] = arith.subi %[[IDX5]], %{{.*}} : index // CHECK: fir.do_loop %[[INDUC0:.*]] = %{{.*}} to %[[UB5]] step %{{.*}} { @@ -186,8 +186,8 @@ // CHECK: %[[IDX1:.*]] = arith.addi %[[INDUC1]], %{{.*}} : index // CHECK: %[[IDX2:.*]] = arith.addi %[[INDUC0]], %{{.*}} : index // CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %[[IDX1:.*]], %[[IDX2:.*]] : (!fir.ref>, !fir.shape<2>, index, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}}, %{{.*}} : (!fir.heap>, !fir.shape<2>, index, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: %{{.*}} = fir.do_loop %[[INDUC0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%{{.*}} = %{{.*}}) -> (!fir.array<10x5xi32>) { // CHECK: %{{.*}} = fir.do_loop %[[INDUC1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%{{.*}} = %{{.*}}) -> (!fir.array<10x5xi32>) { @@ -208,8 +208,8 @@ // CHECK: %[[IDX1:.*]] = arith.addi %[[INDUC1]], %{{.*}} : index // CHECK: %[[IDX2:.*]] = arith.addi %[[INDUC0]], %{{.*}} : index // CHECK: %[[COOR0:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %[[IDX1]], %[[IDX2]] : (!fir.heap>, !fir.shape<2>, index, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}}, %{{.*}} : (!fir.ref>, !fir.shape<2>, index, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: fir.freemem %[[TEMP]] : > @@ -283,12 +283,12 @@ // CHECK-SAME: %[[ARR0:.*]]: !fir.ref>) { // CHECK: %[[VAR0:.*]] = fir.alloca f32 // Allocate the temporary array. -// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<100xf32>, %{{.*}} +// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<100xf32> // Copy original array to temp. 
// CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // CHECK: %[[VAL_21:.*]] = fir.undefined !fir.array<100xf32> @@ -304,8 +304,8 @@ // Copy back result to original array from temp. // CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK: %[[COOR0:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref -// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: %[[COOR1:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // Free the temporary array. @@ -374,8 +374,9 @@ // Test fir.array_load/boxed array func @conversion_with_temporary_boxed_array(%arr0 : !fir.box>) { %c10 = arith.constant 10 : index - %1 = fir.shape %c10 : (index) -> !fir.shape<1> - %2 = fir.array_load %arr0(%1) : (!fir.box>, !fir.shape<1>) -> !fir.array<10xi32> + %1:3 = fir.box_dims %arr0, %c10 : (!fir.box>, index) -> (index, index, index) + %shift = fir.shift %1#0 : (index) -> !fir.shift<1> + %2 = fir.array_load %arr0(%shift) : (!fir.box>, !fir.shift<1>) -> !fir.array<10xi32> %c10_i64 = arith.constant 10 : i64 %3 = fir.convert %c10_i64 : (i64) -> index %c1_i64 = arith.constant 1 : i64 @@ -398,12 +399,12 @@ // CHECK-LABEL: func @conversion_with_temporary_boxed_array( // CHECK-SAME: %[[ARR0:.*]]: !fir.box>) // Allocation of temporary array. -// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32>, %{{.*}} +// CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> // Copy of original array to temp. // CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.box>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref +// CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref -// CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // Perform the assignment i = i(10:1:-1) using the temporary array. @@ -412,15 +413,15 @@ // CHECK-NOT: %{{.*}} = fir.update // CHECK: %[[COOR0:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) [%{{.*}}] %{{.*}} : (!fir.box>, !fir.shape<1>, !fir.slice<1>, index) -> !fir.ref // CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0]] : !fir.ref -// CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[COOR1:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: fir.result %{{.*}} : !fir.array<10xi32> // CHECK: } // Copy the result back to the original array. 
// CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK: %[[COOR0:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref +// CHECK: %[[COOR0:.*]] = fir.array_coor %[[TEMP]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref +// CHECK: %[[COOR1:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref // CHECK: %[[LOAD0:.*]] = fir.load %[[COOR0:.*]] : !fir.ref -// CHECK: %[[COOR1:.*]] = fir.array_coor %[[ARR0]](%{{.*}}) %{{.*}} : (!fir.box>, !fir.shape<1>, index) -> !fir.ref // CHECK: fir.store %[[LOAD0]] to %[[COOR1]] : !fir.ref // CHECK: } // Free temporary array.