diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5123,13 +5123,12 @@
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
     if (isScalarPtrInduction(MemAccess, Ptr)) {
       Worklist.insert(cast<Instruction>(Ptr));
-      Instruction *Update = cast<Instruction>(
-          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-      Worklist.insert(Update);
       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                         << "\n");
-      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
-                        << "\n");
+
+      Instruction *Update = cast<Instruction>(
+          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+      ScalarPtrs.insert(Update);
       return;
     }
     // We only care about bitcast and getelementptr instructions contained in
@@ -5143,11 +5142,12 @@
     if (Worklist.count(I))
       return;

-    // If the use of the pointer will be a scalar use, and all users of the
-    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
-    // place the pointer in PossibleNonScalarPtrs.
-    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
-          return isa<LoadInst>(U) || isa<StoreInst>(U);
+    // If all users of the pointer will be memory accesses and scalar, place the
+    // pointer in ScalarPtrs. Otherwise, place the pointer in
+    // PossibleNonScalarPtrs.
+    if (llvm::all_of(I->users(), [&](User *U) {
+          return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+                 isScalarUse(cast<Instruction>(U), Ptr);
         }))
       ScalarPtrs.insert(I);
     else
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -91,7 +91,7 @@
 ; Same as predicate_store except we use a pointer PHI to maintain the address
 ;
 ; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -scalable-vectorization=on -S -mtriple=aarch64 -mattr=+sve -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; In the test below the PHI instruction:
+;  %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+; has multiple uses, i.e.
+;  1. As a uniform address for the load, and
+;  2. Non-uniform use by the getelementptr + store, which leads to replication.
+
+; CHECK-LABEL: LV: Checking a loop in "phi_multiple_use"
+; CHECK-NOT: LV: Found new scalar instruction: %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+;
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' {
+; CHECK-NEXT: loop.body:
+; CHECK-NEXT:   WIDEN-INDUCTION %index = phi 0, %index.next
+; CHECK-NEXT:   WIDEN-PHI %curchar = phi %curchar.next, %curptr
+; CHECK-NEXT:   WIDEN-PHI %0 = phi %incdec.ptr190, %src
+; CHECK-NEXT:   WIDEN-GEP Var[Inv] ir<%incdec.ptr190> = getelementptr ir<%0>, ir<1>
+; CHECK-NEXT:   WIDEN store ir<%curchar>, ir<%incdec.ptr190>
+; CHECK-NEXT:   WIDEN ir<%1> = load ir<%0>
+; CHECK-NEXT:   WIDEN ir<%2> = add ir<%1>, ir<1>
+; CHECK-NEXT:   WIDEN store ir<%0>, ir<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+define void @phi_multiple_use(i8** noalias %curptr, i8* noalias %src, i64 %N) #0 {
+; CHECK-LABEL: @phi_multiple_use(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT: {{.*}} = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** %curptr, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP3]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, <vscale x 2 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP6]], i64 1
+; CHECK: store <vscale x 2 x i8*> [[TMP5]], <vscale x 2 x i8*>*
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 2 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]],
+; CHECK: store <vscale x 2 x i8> [[TMP9]], <vscale x 2 x i8>*
+
+entry:
+  br label %loop.body
+
+loop.body:                                        ; preds = %loop.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop.body ]
+  %curchar = phi i8** [ %curchar.next, %loop.body ], [ %curptr, %entry ]
+  %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+  %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+  %curchar.next = getelementptr inbounds i8*, i8** %curchar, i64 1
+  store i8* %incdec.ptr190, i8** %curchar, align 8
+  %1 = load i8, i8* %0, align 1
+  %2 = add i8 %1, 1
+  store i8 %2, i8* %0, align 1
+  %index.next = add nuw i64 %index, 1
+  %3 = icmp ne i64 %index.next, %N
+  br i1 %3, label %loop.body, label %exit, !llvm.loop !0
+
+exit:                                             ; preds = %loop.body
+  ret void
+}
+
+
+attributes #0 = {"target-features"="+sve"}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{ !5 }
+!5 = distinct !{ !5, !6 }
+!6 = distinct !{ !7 }
+!7 = distinct !{ !7, !6 }
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -146,26 +146,23 @@
 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP5]], i64 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8*> poison, i8* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8*> [[TMP10]], i8* [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8*> [[TMP11]], i8* [[TMP8]], i32 2
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8*> [[TMP12]], i8* [[TMP9]], i32 3
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to <4 x i8*>*
-; CHECK-NEXT: store <4 x i8*> [[TMP13]], <4 x i8*>* [[TMP15]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP18]], <4 x i8>* [[TMP19]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>*
+; CHECK-NEXT: store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]