Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,6 +8830,14 @@
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.
@@ -8865,13 +8873,6 @@
         continue;
       }
 
-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 1)
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK: %[[ICMP:.*]] = icmp eq <vscale x 4 x i32> %[[LOAD]], zeroinitializer
+; CHECK: %[[PRED1:.*]] = xor <vscale x 4 x i1> %[[ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PRED2:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i1> %[[PRED1]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: %[[PREDPHI]] = select <vscale x 4 x i1> %[[PRED2]], <vscale x 4 x i32> {{%.*}}, <vscale x 4 x i32> {{%.*}}
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[PREDPHI]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.inc ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %tobool7.not = icmp eq i32 %load, 0
+  br i1 %tobool7.not, label %if.else, label %if.then8
+
+if.then8:                                         ; preds = %for.body
+  %add = add nsw i32 %red, 91
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %add19 = add nsw i32 %red, 3
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then8, %if.else
+  %storemerge = phi i32 [ %add19, %if.else ], [ %add, %if.then8 ]
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %if.then4
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -328,8 +328,6 @@
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])