diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5252,24 +5252,13 @@
   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
     addToWorklistIfAllowed(Cmp);
-  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
-  // are pointers that are treated like consecutive pointers during
-  // vectorization. The pointer operands of interleaved accesses are an
-  // example.
-  SmallSetVector<Value *, 8> ConsecutiveLikePtrs;
-
-  // Holds pointer operands of instructions that are possibly non-uniform.
-  SmallPtrSet<Value *, 8> PossibleNonUniformPtrs;
-
   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
     InstWidening WideningDecision = getWideningDecision(I, VF);
     assert(WideningDecision != CM_Unknown &&
            "Widening decision should be ready at this moment");
 
-    // The address of a uniform mem op is itself uniform.  We exclude stores
-    // here as there's an assumption in the current code that all uses of
-    // uniform instructions are uniform and, as noted below, uniform stores are
-    // still handled via replication (i.e. aren't uniform after vectorization).
+    // A uniform memory op is itself uniform. We exclude uniform stores
+    // here as they demand the last lane, not the first one.
     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
       assert(WideningDecision == CM_Scalarize);
       return true;
     }
@@ -5287,14 +5276,15 @@
     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
   };
 
-  // Iterate over the instructions in the loop, and collect all
-  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
-  // that a consecutive-like pointer operand will be scalarized, we collect it
-  // in PossibleNonUniformPtrs instead. We use two sets here because a single
-  // getelementptr instruction can be used by both vectorized and scalarized
-  // memory instructions. For example, if a loop loads and stores from the same
-  // location, but the store is conditional, the store will be scalarized, and
-  // the getelementptr won't remain uniform.
+  // Holds a list of values which are known to have at least one uniform use.
+  // Note that there may be other uses which aren't uniform. A "uniform use"
+  // here is something which only demands lane 0 of the unrolled iterations;
+  // it does not imply that all lanes produce the same value (i.e. this is not
+  // the usual meaning of uniform).
+  SmallPtrSet<Value *, 8> HasUniformUse;
+
+  // Scan the loop for instructions which are either a) known to have only
+  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
       // If there's no pointer operand, there's nothing to do.
@@ -5302,45 +5292,31 @@
       if (!Ptr)
         continue;
 
-      // For now, avoid walking use lists in other functions.
-      // TODO: Rewrite this algorithm from uses up.
-      if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr))
-        continue;
-
-      // A uniform memory op is itself uniform. We exclude stores here as we
-      // haven't yet added dedicated logic in the CLONE path and rely on
-      // REPLICATE + DSE for correctness.
+      // A uniform memory op is itself uniform. We exclude uniform stores
+      // here as they demand the last lane, not the first one.
       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
         addToWorklistIfAllowed(&I);
 
-      // True if all users of Ptr are memory accesses that have Ptr as their
-      // pointer operand. Since loops are assumed to be in LCSSA form, this
-      // disallows uses outside the loop as well.
-      auto UsersAreMemAccesses =
-          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
-            return getLoadStorePointerOperand(U) == Ptr;
-          });
-
-      // Ensure the memory instruction will not be scalarized or used by
-      // gather/scatter, making its pointer operand non-uniform. If the pointer
-      // operand is used by any instruction other than a memory access, we
-      // conservatively assume the pointer operand may be non-uniform.
-      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
-        PossibleNonUniformPtrs.insert(Ptr);
-
-      // If the memory instruction will be vectorized and its pointer operand
-      // is consecutive-like, or interleaving - the pointer operand should
-      // remain uniform.
-      else
-        ConsecutiveLikePtrs.insert(Ptr);
+      if (isUniformDecision(&I, VF)) {
+        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
+        HasUniformUse.insert(Ptr);
+      }
     }
 
-  // Add to the Worklist all consecutive and consecutive-like pointers that
-  // aren't also identified as possibly non-uniform.
-  for (auto *V : ConsecutiveLikePtrs)
-    if (!PossibleNonUniformPtrs.count(V))
-      if (auto *I = dyn_cast<Instruction>(V))
-        addToWorklistIfAllowed(I);
+  // Add to the worklist any operands which have *only* uniform (i.e. lane 0
+  // demanding) users. Since loops are assumed to be in LCSSA form, this
+  // disallows uses outside the loop as well.
+  for (auto *V : HasUniformUse) {
+    if (isOutOfScope(V))
+      continue;
+    auto *I = cast<Instruction>(V);
+    auto UsersAreMemAccesses =
+      llvm::all_of(I->users(), [&](User *U) -> bool {
+        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
+      });
+    if (UsersAreMemAccesses)
+      addToWorklistIfAllowed(I);
+  }
 
   // Expand Worklist in topological order: whenever a new instruction
   // is added , its users should be already inside Worklist. It ensures
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -20,37 +20,58 @@
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[X:%.*]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i8> undef, i8 [[X]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT2]], <4 x i8> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]],
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[P:%.*]], align 1, [[TBAA1:!tbaa !.*]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw <4 x i32> [[TMP5]],
-; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
-; CHECK-NEXT: store i32 [[TMP14]], i32* undef, align 4, [[TBAA4:!tbaa !.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 1
-; CHECK-NEXT: store i32 [[TMP15]], i32* undef, align 4, [[TBAA4]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2
-; CHECK-NEXT: store i32 [[TMP16]], i32* undef, align 4, [[TBAA4]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
-; CHECK-NEXT: store i32 [[TMP17]], i32* undef, align 4, [[TBAA4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT3]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]],
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, [[TBAA1:!tbaa !.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i8> undef, i8 [[TMP8]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT4]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[P]], align 1, [[TBAA1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> undef, i8 [[TMP9]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT5]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]],
+; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw <4 x i32> [[TMP11]],
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP13]], [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0
+; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, [[TBAA4:!tbaa !.*]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1
+; CHECK-NEXT: store i32 [[TMP23]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2
+; CHECK-NEXT: store i32 [[TMP24]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3
+; CHECK-NEXT: store i32 [[TMP25]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP21]], i32 0
+; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP21]], i32 1
+; CHECK-NEXT: store i32 [[TMP27]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP21]], i32 2
+; CHECK-NEXT: store i32 [[TMP28]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
+; CHECK-NEXT: store i32 [[TMP29]], i32* undef, align 4, [[TBAA4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
@@ -61,11 +82,11 @@
 ; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
 ; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[P]], align 1, [[TBAA1]]
-; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP19]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[P]], align 1, [[TBAA1]]
+; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP31]] to i32
 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
+; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* undef, align 1, [[TBAA1]]
 ; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
 ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -593,3 +593,155 @@
 loop_exit:
   ret i32 %accum.next
 }
+
+;; Same as uniform_load, but shows that the uniformity analysis can handle
+;; pointer operands which are not local to the function.
+@GAddr = external global i32, align 4
+define i32 @uniform_load_global() {
+; CHECK-LABEL: @uniform_load_global(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP21:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
+; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], [[LOOP22:!llvm.loop !.*]]
+; CHECK: loopexit:
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %accum = phi i32 [%accum.next, %for.body], [0, %entry]
+  %load = load i32, i32* @GAddr
+  %accum.next = add i32 %accum, %load
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv, 4096
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  ret i32 %accum.next
+}
+
+;; Same as the global case, but using a constexpr
+define i32 @uniform_load_constexpr() {
+; CHECK-LABEL: @uniform_load_constexpr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
+; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], [[LOOP24:!llvm.loop !.*]]
+; CHECK: loopexit:
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %accum = phi i32 [ %accum.next, %for.body ], [ 0, %entry ]
+  %load = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+  %accum.next = add i32 %accum, %load
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv, 4096
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:                                         ; preds = %for.body
+  ret i32 %accum.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
--- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
@@ -15,7 +15,7 @@
 ; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE4:%.*]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 99, [[TMP0]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
@@ -23,36 +23,35 @@
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]],
 ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* @v_38, align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* @v_38, align 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i16> undef, i16 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> [[TMP4]], i16 [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP5]],
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i16> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]],
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> undef, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT1]], <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT2]],
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
 ; CHECK: pred.srem.if:
-; CHECK-NEXT: [[TMP10:%.*]] = srem i16 5786, [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> undef, i16 [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> undef, i16 [[TMP7]], i32 0
 ; CHECK-NEXT: br label [[PRED_SREM_CONTINUE]]
 ; CHECK: pred.srem.continue:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ undef, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SREM_IF]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2]]
-; CHECK: pred.srem.if1:
-; CHECK-NEXT: [[TMP14:%.*]] = srem i16 5786, [[TMP3]]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP14]], i32 1
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
-; CHECK: pred.srem.continue2:
-; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_SREM_CONTINUE]] ], [ [[TMP15]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> , <2 x i16> [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT: store i16 [[TMP17]], i16* @v_39, align 1
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
-; CHECK-NEXT: store i16 [[TMP18]], i16* @v_39, align 1
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ undef, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SREM_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4]]
+; CHECK: pred.srem.if3:
+; CHECK-NEXT: [[TMP11:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT: br label [[PRED_SREM_CONTINUE4]]
+; CHECK: pred.srem.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_SREM_CONTINUE]] ], [ [[TMP12]], [[PRED_SREM_IF3]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i16> , <2 x i16> [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT: store i16 [[TMP14]], i16* @v_39, align 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT: store i16 [[TMP15]], i16* @v_39, align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, 12
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -75,7 +74,7 @@
 ; CHECK-NEXT: store i16 [[COND6]], i16* @v_39, align 1
 ; CHECK-NEXT: [[INC7]] = add nsw i16 [[I_07]], 1
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[INC7]], 111
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[RV:%.*]] = load i16, i16* @v_39, align 1
 ; CHECK-NEXT: ret i16 [[RV]]