Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4668,14 +4668,28 @@
   SetVector<Instruction *> Worklist;
   BasicBlock *Latch = TheLoop->getLoopLatch();
 
+  // Instructions that are scalar with predication must not be considered
+  // uniform after vectorization, because that would create an erroneous
+  // replicating region where only a single instance out of VF should be formed.
+  // TODO: optimize such rare cases if found to be important, see PR40816.
+  auto WorklistInsert = [&](Instruction *I) -> void {
+    // Report and skip predicated scalar instructions. LLVM_DEBUG compiles to
+    // nothing in NDEBUG builds, so no explicit preprocessor guard is needed.
+    if (isScalarWithPredication(I, VF)) {
+      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
+                        << *I << "\n");
+      return;
+    }
+    Worklist.insert(I);
+    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+  };
+
   // Start with the conditional branch. If the branch condition is an
   // instruction contained in the loop that is only used by the branch, it is
   // uniform.
   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
-    Worklist.insert(Cmp);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
-  }
+  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    WorklistInsert(Cmp);
 
   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
   // are pointers that are treated like consecutive pointers during
@@ -4734,10 +4748,8 @@
   // Add to the Worklist all consecutive and consecutive-like pointers that
   // aren't also identified as possibly non-uniform.
   for (auto *V : ConsecutiveLikePtrs)
-    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
-      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
-      Worklist.insert(V);
-    }
+    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+      WorklistInsert(V);
 
   // Expand Worklist in topological order: whenever a new instruction
   // is added , its users should be already inside Worklist.  It ensures
@@ -4763,10 +4775,8 @@
             return Worklist.count(J) ||
                    (OI == getLoadStorePointerOperand(J) &&
                     isUniformDecision(J, VF));
-          })) {
-        Worklist.insert(OI);
-        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
-      }
+          }))
+        WorklistInsert(OI);
     }
   }
 
@@ -4808,11 +4818,8 @@
       continue;
 
     // The induction variable and its update instruction will remain uniform.
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
-                      << "\n");
+    WorklistInsert(Ind);
+    WorklistInsert(IndUpdate);
   }
 
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
Index: llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -64,4 +65,92 @@
   ret void
 }
 
+; CHECK-LABEL: PR40816
+;
+; Check that scalar-with-predication instructions are not considered uniform
+; after vectorization, because that would result in replicating a region
+; instead of having a single instance (out of VF).
+;
+; CHECK:     LV: Found trip count: 3
+; CHECK:     LV: Found uniform instruction:   {{%.*}} = icmp eq i16 {{%.*}}, 0
+; CHECK-NOT: LV: Found uniform instruction:   {{%.*}} = load i16, i16* {{%.*}}, align 1
+; CHECK:     LV: Found not uniform being ScalarWithPredication:  {{%.*}} = load i16, i16* {{%.*}}, align 1
+; CHECK:     LV: Found scalar instruction:   {{%.*}} = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 {{%.*}}, i16 {{%.*}}
+; FORCE-LABEL: @PR40816(
+; FORCE-NEXT:  entry:
+; FORCE-NEXT:    br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
+; FORCE:       vector.ph:
+; FORCE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FORCE:       vector.body:
+; FORCE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; FORCE-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; FORCE-NEXT:    [[VEC_IND3:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; FORCE-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; FORCE-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FORCE:       pred.store.if:
+; FORCE-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
+; FORCE-NEXT:    store i32 [[TMP2]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FORCE:       pred.store.continue:
+; FORCE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; FORCE:       pred.store.if1:
+; FORCE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
+; FORCE-NEXT:    store i32 [[TMP4]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; FORCE:       pred.store.continue2:
+; FORCE-NEXT:    [[TMP5:%.*]] = trunc i32 [[INDEX]] to i16
+; FORCE-NEXT:    [[TMP6:%.*]] = mul <2 x i16> [[VEC_IND3]], [[VEC_IND3]]
+; FORCE-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; FORCE-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FORCE:       pred.load.if:
+; FORCE-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
+; FORCE-NEXT:    [[TMP9:%.*]] = add i16 [[TMP5]], 0
+; FORCE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 [[TMP8]], i16 [[TMP9]]
+; FORCE-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP10]], align 1
+; FORCE-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> undef, i16 [[TMP11]], i32 0
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FORCE:       pred.load.continue:
+; FORCE-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP12]], [[PRED_LOAD_IF]] ]
+; FORCE-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; FORCE-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; FORCE:       pred.load.if5:
+; FORCE-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
+; FORCE-NEXT:    [[TMP16:%.*]] = add i16 [[TMP5]], 1
+; FORCE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 [[TMP15]], i16 [[TMP16]]
+; FORCE-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP17]], align 1
+; FORCE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i16> [[TMP13]], i16 [[TMP18]], i32 1
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; FORCE:       pred.load.continue6:
+; FORCE-NEXT:    [[TMP20:%.*]] = phi <2 x i16> [ [[TMP13]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
+; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[VEC_IND_NEXT4]] = add <2 x i16> [[VEC_IND3]], <i16 2, i16 2>
+; FORCE-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; FORCE-NEXT:    br i1 [[TMP21]], label {{%.*}}, label [[VECTOR_BODY]]
+;
+
+@a = internal constant [5 x [3 x i16]] [[3 x i16] [i16 7, i16 7, i16 7], [3 x i16] [i16 8, i16 8, i16 8], [3 x i16] [i16 4, i16 0, i16 5], [3 x i16] [i16 0, i16 4, i16 0], [3 x i16] zeroinitializer], align 1
+@b = external global i32, align 1
+
+define void @PR40816() #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; loop counter i, starting at 0
+  store i32 %storemerge, i32* @b, align 1 ; b = i
+  %0 = trunc i32 %storemerge to i16
+  %1 = mul i16 %0, %0 ; i * i, used as the outer index into @a
+  %arrayidx1 = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 %1, i16 %0 ; &a[i*i][i]
+  %2 = load i16, i16* %arrayidx1, align 1 ; the load the checks above expect to be reported as not uniform
+  %cmp2 = icmp eq i16 %2, 0
+  %inc = add nuw nsw i32 %storemerge, 1
+  br i1 %cmp2, label %return, label %for.body ; leave the loop once the loaded element is zero
+
+return:                                           ; preds = %for.body
+  ret void
+}
+
 attributes #0 = { "target-cpu"="knl" }