diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1509,14 +1509,17 @@
   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I) {
+  // Optional parameter \p VF is unused by this function, and passed directly to
+  // isScalarWithPredication in the fall-back case.
+  bool isPredicatedInst(Instruction *I,
+                        ElementCount VF = ElementCount::getFixed(1)) {
     if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       return Legal->isMaskRequired(I);
-    return isScalarWithPredication(I);
+    return isScalarWithPredication(I, VF);
   }
 
   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -8594,8 +8597,7 @@
       Range);
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
-      Range);
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);
 
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -dce -instcombine \
+; RUN:   -debug-only=loop-vectorize \
+; RUN:   -S -o - 2>%t < %s | FileCheck %s
+; RUN: FileCheck --check-prefix=DBG %s < %t
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; IR generated from (approximately):
+;
+; 1      void foo(int *restrict data1, int *restrict data2)
+; 2      {
+; 3        int counter = 1024;
+; 4        while (counter--)
+; 5          if (data1[counter] > data2[counter])
+; 6            data1[counter] = data2[counter];
+; 7      }
+;
+
+define void @foo(i32* %data1, i32* %data2) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DATA2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
+; DBG: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %0 = load i32, i32* %arrayidx, align 4
+; DBG-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %1 = load i32, i32* %arrayidx2, align 4
+; DBG-NEXT: LV: Scalarizing: %cmp = icmp sgt i32 %0, %1
+; DBG-NEXT: LV: Scalarizing and predicating: store i32 %1, i32* %arrayidx, align 4
+; DBG-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+entry:
+  br label %while.body
+
+while.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %1, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool.not = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -65,9 +65,11 @@
 ; sink-scalar-operands optimization for predicated instructions.
 ;
 ; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.load.if:
+; SINK-GATHER:   %{{.*}} = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: pred.load.continue:
 ; SINK-GATHER: pred.udiv.if:
-; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
-; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER:   %{{.*}} = udiv i32 %{{.*}}, %{{.*}}
 ; SINK-GATHER: pred.udiv.continue:
 define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry: