Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -289,6 +289,19 @@
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
 
+  /// A uniform memory op is a load or store which accesses the same memory
+  /// location on all lanes.
+  bool isUniformMemOp(Instruction &I) {
+    Value *Ptr = getLoadStorePointerOperand(&I);
+    if (!Ptr)
+      return false;
+    // Note: There's nothing inherent which prevents predicated loads and
+    // stores from being uniform.  The current lowering simply doesn't handle
+    // it; in particular, the cost model distinguishes scatter/gather from
+    // scalar w/predication, and we currently rely on the scalar path.
+    return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+  }
+
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return LAI->getRuntimePointerChecking();
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5030,6 +5030,8 @@
   // replicating region where only a single instance out of VF should be formed.
   // TODO: optimize such seldom cases if found important, see PR40816.
   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+    if (isOutOfScope(I))
+      return;
     if (isScalarWithPredication(I, VF)) {
       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                         << *I << "\n");
@@ -5060,6 +5062,15 @@
     assert(WideningDecision != CM_Unknown &&
            "Widening decision should be ready at this moment");
 
+    // The address of a uniform mem op is itself uniform.  We exclude stores
+    // here as there's an assumption in the current code that all uses of
+    // uniform instructions are uniform and, as noted below, uniform stores are
+    // still handled via replication (i.e. aren't uniform after vectorization).
+    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
+      assert(WideningDecision == CM_Scalarize);
+      return true;
+    }
+
     return (WideningDecision == CM_Widen ||
             WideningDecision == CM_Widen_Reverse ||
             WideningDecision == CM_Interleave);
@@ -5079,6 +5090,12 @@
       if (!Ptr)
         continue;
 
+      // A uniform memory op is itself uniform.  We exclude stores here as we
+      // haven't yet added dedicated logic in the CLONE path and rely on
+      // REPLICATE + DSE for correctness.
+      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+        addToWorklistIfAllowed(&I);
+
       // True if all users of Ptr are memory accesses that have Ptr as their
       // pointer operand.
       auto UsersAreMemAccesses =
@@ -6231,6 +6248,8 @@
 
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          ElementCount VF) {
+  assert(Legal->isUniformMemOp(*I));
+
   Type *ValTy = getMemInstValueType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
@@ -6408,11 +6427,7 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;
 
-      if (Legal->isUniform(Ptr) &&
-          // Conditional loads and stores should be scalarized and predicated.
-          // isScalarWithPredication cannot be used here since masked
-          // gather/scatters are not considered scalar with predication.
-          !Legal->blockNeedsPredication(I.getParent())) {
+      if (Legal->isUniformMemOp(I)) {
         // TODO: Avoid replicating loads and stores instead of
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
Index: llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
+++ llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -68,24 +68,19 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !0
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD12]]
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP7]], [[WIDE_LOAD12]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !alias.scope !5, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]]
@@ -100,17 +95,17 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label [[DOTOUTER_PREHEADER]]
 ; CHECK:       .inner:
 ; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = add nsw i32 [[TMP23]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP24]], [[TMP26]]
-; CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP25]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    store i32 [[TMP20]], i32* [[TMP18]], align 4
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]]
-; CHECK-NEXT:    br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop !10
+; CHECK-NEXT:    br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], [[LOOP10:!llvm.loop !.*]]
 ;
   br label %.outer.preheader
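
Note for readers: below is a small, hypothetical IR sketch (the function names @uniform_load_example and @predicated_uniform_load_example are illustrative and not part of this patch or its tests; it uses the same typed-pointer style as the test above) of the two cases isUniformMemOp() distinguishes. In the first function the load address is loop-invariant and unpredicated, so the access is a uniform memory op; per the updated CHECK lines, the vector body is expected to contain one scalar load plus an insertelement/shufflevector splat (the BROADCAST_SPLAT pattern) instead of VF scalar loads feeding VF insertelements. In the second function the same invariant address sits in a conditionally executed block, so isUniformMemOp() returns false and the access stays on the scalar-with-predication path, matching the "Note:" comment in the new header helper.

; Hypothetical: invariant, unpredicated address => uniform memory op.
define void @uniform_load_example(i32* %dst, i32* %src, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The address %src does not vary with %iv, so every lane reads the same
  ; location; expected lowering is one scalar load + broadcast.
  %v = load i32, i32* %src, align 4
  %gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %v, i32* %gep, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; Hypothetical: same invariant address, but inside a block that needs
; predication, so isUniformMemOp() deliberately returns false.
define void @predicated_uniform_load_example(i32* %dst, i32* %src, i32* %cond, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %c.gep = getelementptr inbounds i32, i32* %cond, i64 %iv
  %c = load i32, i32* %c.gep, align 4
  %tobool = icmp ne i32 %c, 0
  br i1 %tobool, label %if.then, label %latch

if.then:
  ; Invariant address, but predicated: stays on the scalar-with-predication path.
  %v = load i32, i32* %src, align 4
  %d.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %v, i32* %d.gep, align 4
  br label %latch

latch:
  %iv.next = add nuw nsw i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}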