diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -99,7 +99,17 @@
 
 namespace fir {
 
-static void defaultFlangInlinerOptPipeline(mlir::OpPassManager &pm) {
+/// Add the MLIR Canonicalizer pass with region simplification disabled.
+/// FIR does not support the promotion of some SSA values to block arguments
+/// (or into arith.select operands) that may be done by MLIR block merging in
+/// the region simplification (e.g., !fir.shape<> SSA values are not supported
+/// as block arguments).
+/// Aside from the fir.shape issue, moving some abstract SSA values into block
+/// arguments may be costly since it forces their code generation, which may
+/// be expensive (e.g., array temporaries). The MLIR pass does not take these
+/// extra costs into account when doing block merging.
+static void addCanonicalizerPassWithoutRegionSimplification(
+    mlir::OpPassManager &pm) {
   mlir::GreedyRewriteConfig config;
   config.enableRegionSimplification = false;
   pm.addPass(mlir::createCanonicalizerPass(config));
@@ -200,8 +210,8 @@
   // The default inliner pass adds the canonicalizer pass with the default
   // configuration. Create the inliner pass with tco config.
   llvm::StringMap<mlir::OpPassManager> pipelines;
-  pm.addPass(
-      mlir::createInlinerPass(pipelines, defaultFlangInlinerOptPipeline));
+  pm.addPass(mlir::createInlinerPass(
+      pipelines, addCanonicalizerPassWithoutRegionSimplification));
   pm.addPass(fir::createSimplifyRegionLitePass());
   pm.addPass(mlir::createCSEPass());
 
@@ -225,7 +235,7 @@
 inline void createHLFIRToFIRPassPipeline(
     mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) {
   if (optLevel.isOptimizingForSpeed())
-    pm.addPass(mlir::createCanonicalizerPass());
+    addCanonicalizerPassWithoutRegionSimplification(pm);
   pm.addPass(hlfir::createLowerHLFIRIntrinsicsPass());
   pm.addPass(hlfir::createBufferizeHLFIRPass());
   pm.addPass(hlfir::createConvertHLFIRtoFIRPass());
diff --git a/flang/test/HLFIR/no-block-merging.fir b/flang/test/HLFIR/no-block-merging.fir
new file mode 100644
--- /dev/null
+++ b/flang/test/HLFIR/no-block-merging.fir
@@ -0,0 +1,33 @@
+// Test that the HLFIR pipeline does not call the MLIR canonicalizer with block
+// merging enabled (moving fir.shape to a block argument would cause failures
+// when translating the FIR to LLVM).
+// RUN: %flang_fc1 %s -flang-experimental-hlfir -emit-llvm -O2 -o - | FileCheck %s
+
+func.func @no_shape_merge(%cdt: i1, %from: !fir.ref<!fir.array<?xf64>>, %to : !fir.ref<f64>) {
+  %c10 = arith.constant 10 : index
+  %c20 = arith.constant 20 : index
+  %c5 = arith.constant 5 : index
+  %shape1 = fir.shape %c10 : (index) -> !fir.shape<1>  // only used in ^bb1
+  %shape2 = fir.shape %c20 : (index) -> !fir.shape<1>  // only used in ^bb2
+  cf.cond_br %cdt, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %coor1 = fir.array_coor %from(%shape1) %c5 : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, index) -> !fir.ref<f64>
+  %load1 = fir.load %coor1 : !fir.ref<f64>
+  fir.store %load1 to %to : !fir.ref<f64>
+  cf.br ^bb3
+^bb2:  // pred: ^bb0 -- same operations as ^bb1 except for the shape operand
+  %coor2 = fir.array_coor %from(%shape2) %c5 : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, index) -> !fir.ref<f64>
+  %load2 = fir.load %coor2 : !fir.ref<f64>
+  fir.store %load2 to %to : !fir.ref<f64>
+  cf.br ^bb3
+^bb3:  // pred: ^bb1, ^bb2
+  return
+}
+
+// Note: block merging happens in the output below, but after FIR codegen.
+
+// CHECK-LABEL:  define void @no_shape_merge(
+// CHECK:  %[[GEP:.*]] = getelementptr double, ptr %{{.*}}
+// CHECK:  %[[LOAD:.*]] = load double, ptr %[[GEP]]
+// CHECK:  store double %[[LOAD]], ptr %{{.*}}
+// CHECK:  ret void