diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2062,15 +2062,15 @@ if (OrigLoop->isLoopInvariant(V)) return V; - assert(Instance.Lane > 0 - ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); - + // Always use lane 0 for uniform values. + unsigned Lane = Instance.Lane; + if (Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) + Lane = 0; // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Instance)) - return VectorLoopValueMap.getScalarValue(V, Instance); + if (VectorLoopValueMap.hasScalarValue(V, {Instance.Part, Lane})) + return VectorLoopValueMap.getScalarValue(V, {Instance.Part, Lane}); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the @@ -4671,11 +4671,11 @@ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } - // Holds consecutive and consecutive-like pointers. Consecutive-like pointers - // are pointers that are treated like consecutive pointers during - // vectorization. The pointer operands of interleaved accesses are an - // example. - SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; + // Holds consecutive and consecutive-like pointers, as well as trivially loop + // invariant instructions. Consecutive-like pointers are pointers that are + // treated like consecutive pointers during vectorization. The pointer + // operands of interleaved accesses are an example. + SmallSetVector<Instruction *, 8> PotentialUniformRoots; // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; @@ -4699,6 +4699,16 @@ // the getelementptr won't remain uniform. 
for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { + // Instructions with loop invariant operands are uniform, as long as + // they do not read or write memory, are PHI nodes or terminators. + if (&I != BB->getTerminator() && !I.mayReadOrWriteMemory() && + !isa<PHINode>(&I) && all_of(I.operands(), [this](Use &U) { + return this->TheLoop->isLoopInvariant(U); + })) { + PotentialUniformRoots.insert(&I); + continue; + } + // If there's no pointer operand, there's nothing to do. auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (!Ptr) @@ -4722,12 +4732,12 @@ // is consecutive-like, or interleaving - the pointer operand should // remain uniform. else - ConsecutiveLikePtrs.insert(Ptr); + PotentialUniformRoots.insert(Ptr); } // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. - for (auto *V : ConsecutiveLikePtrs) + for (auto *V : PotentialUniformRoots) if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); Worklist.insert(V); @@ -4754,7 +4764,7 @@ auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || + return Worklist.count(J) || TheLoop->isLoopInvariant(U) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); })) { @@ -5481,8 +5491,6 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, unsigned VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && - "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -9,8 +9,8 @@ ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2. ; CM: LV: Scalar loop costs: 7. -; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 +; CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 +; CM-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 ; Check that the extractvalue operands are actually free in vector code. @@ -21,21 +21,19 @@ ; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0 -; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0 -; FORCED-NEXT: %3 = insertelement <2 x i64> undef, i64 %1, i32 0 -; FORCED-NEXT: %4 = insertelement <2 x i64> %3, i64 %2, i32 1 -; FORCED-NEXT: %5 = extractvalue { i64, i64 } %sv, 1 -; FORCED-NEXT: %6 = extractvalue { i64, i64 } %sv, 1 -; FORCED-NEXT: %7 = insertelement <2 x i64> undef, i64 %5, i32 0 -; FORCED-NEXT: %8 = insertelement <2 x i64> %7, i64 %6, i32 1 -; FORCED-NEXT: %9 = getelementptr i64, i64* %dst, i32 %0 -; FORCED-NEXT: %10 = add <2 x i64> %4, %8 -; FORCED-NEXT: %11 = getelementptr i64, i64* %9, i32 0 -; FORCED-NEXT: %12 = bitcast i64* %11 to <2 x i64>* -; FORCED-NEXT: store <2 x i64> %10, <2 x i64>* %12, align 4 +; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %1, i32 0 +; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x 
i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 1 +; FORCED-NEXT: %broadcast.splatinsert3 = insertelement <2 x i64> undef, i64 %2, i32 0 +; FORCED-NEXT: %broadcast.splat4 = shufflevector <2 x i64> %broadcast.splatinsert3, <2 x i64> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %3 = getelementptr i64, i64* %dst, i32 %0 +; FORCED-NEXT: %4 = add <2 x i64> %broadcast.splat2, %broadcast.splat4 +; FORCED-NEXT: %5 = getelementptr i64, i64* %3, i32 0 +; FORCED-NEXT: %6 = bitcast i64* %5 to <2 x i64>* +; FORCED-NEXT: store <2 x i64> %4, <2 x i64>* %6, align 4 ; FORCED-NEXT: %index.next = add i32 %index, 2 -; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0 -; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0 +; FORCED-NEXT: %7 = icmp eq i32 %index.next, 0 +; FORCED-NEXT: br i1 %7, label %middle.block, label %vector.body, !llvm.loop !0 define void @test1(i64* %dst, {i64, i64} %sv) { entry: @@ -61,8 +59,8 @@ declare float @pow(float, float) readnone nounwind ; CM: LV: Scalar loop costs: 16. 
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 +; CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 +; CM-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 ; FORCED-LABEL: define void @test_getVectorCallCost @@ -73,21 +71,19 @@ ; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0 -; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0 -; FORCED-NEXT: %3 = insertelement <2 x float> undef, float %1, i32 0 -; FORCED-NEXT: %4 = insertelement <2 x float> %3, float %2, i32 1 -; FORCED-NEXT: %5 = extractvalue { float, float } %sv, 1 -; FORCED-NEXT: %6 = extractvalue { float, float } %sv, 1 -; FORCED-NEXT: %7 = insertelement <2 x float> undef, float %5, i32 0 -; FORCED-NEXT: %8 = insertelement <2 x float> %7, float %6, i32 1 -; FORCED-NEXT: %9 = getelementptr float, float* %dst, i32 %0 -; FORCED-NEXT: %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8) -; FORCED-NEXT: %11 = getelementptr float, float* %9, i32 0 -; FORCED-NEXT: %12 = bitcast float* %11 to <2 x float>* -; FORCED-NEXT: store <2 x float> %10, <2 x float>* %12, align 4 +; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> undef, float %1, i32 0 +; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x float> %broadcast.splatinsert1, <2 x float> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 1 +; FORCED-NEXT: %broadcast.splatinsert3 = insertelement <2 x float> undef, float %2, i32 0 +; FORCED-NEXT: %broadcast.splat4 = shufflevector <2 x float> %broadcast.splatinsert3, <2 x float> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %3 = getelementptr float, 
float* %dst, i32 %0 +; FORCED-NEXT: %4 = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat2, <2 x float> %broadcast.splat4) +; FORCED-NEXT: %5 = getelementptr float, float* %3, i32 0 +; FORCED-NEXT: %6 = bitcast float* %5 to <2 x float>* +; FORCED-NEXT: store <2 x float> %4, <2 x float>* %6, align 4 ; FORCED-NEXT: %index.next = add i32 %index, 2 -; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0 -; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4 +; FORCED-NEXT: %7 = icmp eq i32 %index.next, 0 +; FORCED-NEXT: br i1 %7, label %middle.block, label %vector.body, !llvm.loop !4 define void @test_getVectorCallCost(float* %dst, {float, float} %sv) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/assume.ll b/llvm/test/Transforms/LoopVectorize/X86/assume.ll --- a/llvm/test/Transforms/LoopVectorize/X86/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/assume.ll @@ -66,18 +66,6 @@ ; CHECK: @llvm.assume ; CHECK: @llvm.assume ; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume ; CHECK: for.body: ; CHECK: ret void diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -23,15 +23,39 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]] -; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>* -; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, <2 x i64> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x %rec8*> [[TMP2]] to <2 x i16*> +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16*, i16** [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16** [[TMP6]] to <2 x i16*>* +; CHECK-NEXT: store <2 x i16*> [[TMP3]], <2 x i16*>* [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ] +; CHECK-NEXT: br label 
[[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]] +; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16* +; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64 +; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]] +; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]] +; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 +; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2 +; CHECK: bb3: +; CHECK-NEXT: ret void +; bb1: br label %bb2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -18,16 +18,16 @@ ; CHECK: if.then: ; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[X:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = 
insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa !1 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 ; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 @@ -43,19 +43,25 @@ ; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* undef, align 1, !tbaa !1 -; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0 -; CHECK-NEXT: store i32 [[TMP21]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1 -; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP19]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP14]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP21:%.*]] = zext i8 undef to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP20]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP22]], i32 0 ; CHECK-NEXT: 
store i32 [[TMP23]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP22]], i32 1 ; CHECK-NEXT: store i32 [[TMP24]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2 +; CHECK-NEXT: store i32 [[TMP25]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 +; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, !tbaa !4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]] @@ -66,11 +72,11 @@ ; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32 ; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24 -; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 -; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32 diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll --- a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll @@ -2,6 +2,8 @@ target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i686-pc-windows-msvc18.0.0" +; Loop invariant call to @floor is uniform, which means we do not end up +; with any vector instructions in the loop. define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 { entry: invoke void @_CxxThrowException(i8* null, i8* null) @@ -33,7 +35,43 @@ ; CHECK-LABEL: define void @test1( ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null] -; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ] +; CHECK: call double @floor(double 1.000000e+00) #1 [ "funclet"(token %1) ] + +define void @test2(double* %A) #0 personality i32 (...)* @__CxxFrameHandler3 { +entry: + invoke void @_CxxThrowException(i8* null, i8* null) + to label %unreachable unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null, i32 64, i8* null] + br label %for.body + +for.cond.cleanup: ; preds = %for.body + catchret from %1 to label %try.cont + +for.body: ; preds = %for.body, %catch + %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ] + %A.ptr = getelementptr double, double* %A, i32 %i.07 + %A.val = load double, double* %A.ptr + %call = call double @floor(double %A.val) #1 [ "funclet"(token %1) ] + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +try.cont: ; preds = %for.cond.cleanup + ret void + +unreachable: ; preds = %entry + unreachable +} + +; CHECK-LABEL: define void @test2( +; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null] +; 
CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %wide.load) [ "funclet"(token %1) ] + declare x86_stdcallcc void @_CxxThrowException(i8*, i8*) diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -25,23 +25,25 @@ ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32* [[A]], null +; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i1> undef, i1 [[TMP3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT7]], <16 x i1> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT6]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[BROADCAST_SPLAT8]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[BROADCAST_SPLAT8]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -6,21 +6,50 @@ ; first test checks that loop with a reduction and a uniform store gets ; 
vectorized. -; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction -; CHECK-LABEL: vector.memcheck: -; CHECK: found.conflict - -; CHECK-LABEL: vector.body: -; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: %wide.load = load <16 x i32> -; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load -; CHECK: store i32 %ntrunc, i32* %a -; CHECK-NOT: store i32 %ntrunc, i32* %a -; CHECK: %index.next = add i64 %index, 64 - -; CHECK-LABEL: middle.block: -; CHECK: %rdx.shuf = shufflevector <16 x i32> define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP10:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP11:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP12:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP13:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* 
[[TMP7]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP10]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11]] = add <16 x i32> [[VEC_PHI8]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP12]] = add <16 x i32> [[VEC_PHI9]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[VEC_PHI10]], [[WIDE_LOAD13]] +; CHECK-NEXT: store i32 %ntrunc, i32* %a, align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX14]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[BIN_RDX15]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <16 x i32> [[BIN_RDX15]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF17:%.*]] = shufflevector <16 x i32> [[BIN_RDX16]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX18:%.*]] = add <16 x i32> [[BIN_RDX16]], [[RDX_SHUF17]] +; CHECK-NEXT: [[RDX_SHUF19:%.*]] = shufflevector <16 x i32> [[BIN_RDX18]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <16 x i32> [[BIN_RDX18]], [[RDX_SHUF19]] +; CHECK-NEXT: [[RDX_SHUF21:%.*]] = shufflevector <16 x i32> [[BIN_RDX20]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX22:%.*]] = add <16 x i32> [[BIN_RDX20]], [[RDX_SHUF21]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[BIN_RDX22]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 %smax, %n.vec +; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label %scalar.ph + entry: %ntrunc = trunc i64 %n to i32 br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -537,10 +537,10 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE36:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP168:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP169:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP170:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP171:%.*]], [[PRED_LOAD_CONTINUE36]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], @@ -611,211 +611,199 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 
[[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32* -; CHECK-NEXT: [[TMP68:%.*]] = load i32, i32* [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP67:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i16* [[TMP69]] to i32* +; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> undef, i32 [[TMP71]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP72]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF7:%.*]], label 
[[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]] -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP76:%.*]] = bitcast i16* [[TMP75]] to i32* +; CHECK-NEXT: [[TMP77:%.*]] = load i32, i32* [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP77]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK-NEXT: [[TMP79:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP78]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 +; CHECK-NEXT: br i1 [[TMP80]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16* [[TMP81]] to i32* +; CHECK-NEXT: [[TMP83:%.*]] = load i32, i32* [[TMP82]] +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> [[TMP79]], i32 
[[TMP83]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP83]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK-NEXT: [[TMP85:%.*]] = phi <4 x i32> [ [[TMP79]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP84]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 +; CHECK-NEXT: br i1 [[TMP86]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* ; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]] -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP89]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP90]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP85]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP90]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 ; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]] 
-; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> undef, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP94:%.*]] = bitcast i16* [[TMP93]] to i32* +; CHECK-NEXT: [[TMP95:%.*]] = load i32, i32* [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> undef, i32 [[TMP95]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP97]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-NEXT: [[TMP97:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP96]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: -; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* -; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]] -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP100:%.*]] = bitcast i16* [[TMP99]] to i32* +; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]] +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP97]], i32 [[TMP101]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP104]], [[PRED_LOAD_IF15]] ] -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br 
i1 [[TMP106]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP97]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP102]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 +; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: -; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* -; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]] -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast i16* [[TMP105]] to i32* +; CHECK-NEXT: [[TMP107:%.*]] = load i32, i32* [[TMP106]] +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> [[TMP103]], i32 [[TMP107]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP111]], [[PRED_LOAD_IF17]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-NEXT: [[TMP109:%.*]] = phi <4 x i32> [ [[TMP103]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP108]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 +; CHECK-NEXT: br i1 [[TMP110]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: -; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* -; CHECK-NEXT: 
[[TMP117:%.*]] = load i32, i32* [[TMP116]] -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3 +; CHECK-NEXT: [[TMP111:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast i16* [[TMP111]] to i32* +; CHECK-NEXT: [[TMP113:%.*]] = load i32, i32* [[TMP112]] +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP113]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP118]], [[PRED_LOAD_IF19]] ] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-NEXT: [[TMP115:%.*]] = phi <4 x i32> [ [[TMP109]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP114]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 +; CHECK-NEXT: br i1 [[TMP116]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: -; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* -; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]] -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> undef, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP117:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16* [[TMP117]] to i32* +; CHECK-NEXT: [[TMP119:%.*]] = load i32, i32* [[TMP118]] +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> undef, i32 [[TMP119]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP125]], [[PRED_LOAD_IF21]] ] -; CHECK-NEXT: 
[[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-NEXT: [[TMP121:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP120]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 +; CHECK-NEXT: br i1 [[TMP122]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: -; CHECK-NEXT: [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP124:%.*]] = bitcast i16* [[TMP123]] to i32* +; CHECK-NEXT: [[TMP125:%.*]] = load i32, i32* [[TMP124]] +; CHECK-NEXT: [[TMP126:%.*]] = insertelement <4 x i32> [[TMP121]], i32 [[TMP125]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP132]], [[PRED_LOAD_IF23]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-NEXT: [[TMP127:%.*]] = phi <4 x i32> [ [[TMP121]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP126]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 +; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: -; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] -; CHECK-NEXT: 
[[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* -; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]] -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 +; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* +; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] +; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP127]], i32 [[TMP131]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP139]], [[PRED_LOAD_IF25]] ] -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP127]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP132]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: -; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* -; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]] -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16* [[TMP135]] to i32* +; CHECK-NEXT: [[TMP137:%.*]] = load i32, i32* [[TMP136]] +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP137]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], 
[[PRED_LOAD_CONTINUE26]] ], [ [[TMP146]], [[PRED_LOAD_IF27]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: [[TMP139:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP138]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP140:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 +; CHECK-NEXT: br i1 [[TMP140]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] ; CHECK: pred.load.if29: -; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* -; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]] -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> undef, i32 [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP142:%.*]] = bitcast i16* [[TMP141]] to i32* +; CHECK-NEXT: [[TMP143:%.*]] = load i32, i32* [[TMP142]] +; CHECK-NEXT: [[TMP144:%.*]] = insertelement <4 x i32> undef, i32 [[TMP143]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] ; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP153]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-NEXT: [[TMP145:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP144]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP146:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 +; CHECK-NEXT: br i1 [[TMP146]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] ; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP157:%.*]] = getelementptr 
inbounds i16, i16* [[TMP156]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* -; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]] -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP147:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP148:%.*]] = bitcast i16* [[TMP147]] to i32* +; CHECK-NEXT: [[TMP149:%.*]] = load i32, i32* [[TMP148]] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <4 x i32> [[TMP145]], i32 [[TMP149]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] ; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP160]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-NEXT: [[TMP151:%.*]] = phi <4 x i32> [ [[TMP145]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP150]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 +; CHECK-NEXT: br i1 [[TMP152]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] ; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* -; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]] -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP153:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP154:%.*]] = bitcast i16* [[TMP153]] to i32* +; CHECK-NEXT: [[TMP155:%.*]] = load i32, i32* [[TMP154]] +; CHECK-NEXT: [[TMP156:%.*]] = insertelement <4 x i32> [[TMP151]], i32 [[TMP155]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] ; CHECK: pred.load.continue34: -; 
CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP167]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] +; CHECK-NEXT: [[TMP157:%.*]] = phi <4 x i32> [ [[TMP151]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP156]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 +; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] ; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* -; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]] -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 +; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP160:%.*]] = bitcast i16* [[TMP159]] to i32* +; CHECK-NEXT: [[TMP161:%.*]] = load i32, i32* [[TMP160]] +; CHECK-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> [[TMP157]], i32 [[TMP161]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] ; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP174]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP163:%.*]] = phi <4 x i32> [ [[TMP157]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP162]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: [[TMP164:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP165:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP166:%.*]] = xor <4 x i1> [[TMP55]], +; 
CHECK-NEXT: [[TMP167:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] -; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] -; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] +; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP115]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP139]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP163]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP169]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] +; CHECK-NEXT: [[TMP170]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] +; CHECK-NEXT: [[TMP171]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK-NEXT: [[TMP172:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP172]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] -; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX40]] +; 
CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP169]], [[TMP168]] +; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP170]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP171]], [[BIN_RDX40]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX41]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX42:%.*]] = add <4 x i32> [[BIN_RDX41]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF43:%.*]] = shufflevector <4 x i32> [[BIN_RDX42]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[BIN_RDX42]], [[RDX_SHUF43]] -; CHECK-NEXT: [[TMP185:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 +; CHECK-NEXT: [[TMP173:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP173]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -836,7 +824,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop !9 ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP173]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -336,8 +336,9 @@ ; UNROLL-NO-IC-LABEL: @constant_folded_previous_value( ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ , %vector.body ] -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> , <4 x i32> +; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ %broadcast.splat4, %vector.body ] +; UNROLL-NO-IC-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 +; UNROLL-NO-IC-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer ; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body ; define void @constant_folded_previous_value() { diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -375,21 +375,18 @@ ; CHECK-LABEL: non_uniform_live_out() ; CHECK-LABEL: vector.body: -; CHECK: %vec.ind = phi <2 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] -; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, -; CHECK: [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0 -; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]] +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[ADD1:%[a-zA-Z0-9.]+]] = add i32 %index, 0 +; CHECK-NEXT: [[ADD2:%[a-zA-Z0-9.]+]] = add i32 [[ADD1]], 7 +; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[ADD2]] ; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0 ; CHECK-NEXT: [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>* ; CHECK-NEXT: %wide.load = load <2 x i8>, <2 x i8>* [[BC]] -; CHECK-NEXT: 
[[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, -; CHECK: store <2 x i8> [[ADD2]], <2 x i8>* - -; CHECK-LABEL: middle.block: -; CHECK: [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1 +; CHECK-NEXT: [[ADD3:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, +; CHECK: store <2 x i8> [[ADD3]], <2 x i8>* ; CHECK-LABEL: for.end: -; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ] +; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADD2]], %middle.block ] ; CHECK: %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa define i32 @non_uniform_live_out() { entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -S | FileCheck --check-prefix=CM %s +; RUN: opt -force-vector-width=4 < %s -loop-vectorize -S | FileCheck --check-prefix=FORCE %s ; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]' ; but the IR Verifier requires for PHI one entry for each predecessor of @@ -6,9 +7,27 @@ ; added 'undef' for an predecessor BB and which is not correct. We copy the real ; value for another predecessor instead of bringing 'undef'. -; CHECK-LABEL: for.cond.preheader: -; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] +; FORCE-LABEL: for.cond.preheader: +; FORCE-NEXT: %e.0.ph = phi i32 [ 0, %if.end.2.i ] +; Without forcing vectorization, we do not vectorize because we won't generate +; any vector instructions, besides the loop management code. 
+; CM-LABEL: entry: +; CM-NEXT: br label %for.cond1.preheader.i + +; CM-LABEL: for.cond1.preheader.i: +; CM-NEXT: %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] +; CM-NEXT: %tobool.i = icmp ne i32 undef, 0 +; CM-NEXT: br label %if.end.2.i + +; CM-LABEL: if.end.2.i: +; CM-NEXT: %inc5.i = add nsw i32 %c.06.i, 1 +; CM-NEXT: %cmp.i = icmp slt i32 %inc5.i, 16 +; CM-NEXT: br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader + +; CM-LABEL: for.cond.preheader: +; CM-NEXT: %e.0.ph = phi i32 [ 0, %if.end.2.i ] +; CM-NEXT: unreachable ; Function Attrs: nounwind uwtable define void @main() #0 { entry: diff --git a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll --- a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll @@ -1,18 +1,18 @@ ; RUN: opt -S -loop-vectorize -force-vector-width=4 %s | FileCheck %s -; CHECK-LABEL: @test_fshl +; CHECK-LABEL: @test_fshl_invariant ; CHECK-LABEL: vector.body: ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 ; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, ; CHECK-NEXT: %0 = add i32 %index, 0 -; CHECK-NEXT: %1 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> ) +; CHECK-NEXT: %1 = tail call i16 @llvm.fshl.i16(i16 undef, i16 undef, i16 15) ; CHECK-NEXT: %index.next = add i32 %index, 4 ; CHECK-NEXT: %2 = icmp eq i32 %index.next, %n.vec ; CHECK-NEXT: br i1 %2, label %middle.block, label %vector.body, !llvm.loop !0 ; -define void @test_fshl(i32 %width) { +define void @test_fshl_invariant(i32 %width) { entry: br label %for.body9.us.us @@ -28,3 +28,36 @@ } declare i16 
@llvm.fshl.i16(i16, i16, i16) + +; CHECK-LABEL: @test_fshl( +; CHECK-LABEL: vector.body: + ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, +; CHECK-NEXT: %8 = add i32 %index, 0 +; CHECK-NEXT: %9 = getelementptr i16, i16* %A, i32 %8 +; CHECK-NEXT: %10 = getelementptr i16, i16* %9, i32 0 +; CHECK-NEXT: %11 = bitcast i16* %10 to <4 x i16>* +; CHECK-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %11, align 2 +; CHECK-NEXT: %12 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %wide.load, <4 x i16> %wide.load, <4 x i16> ) +; CHECK-NEXT: %index.next = add i32 %index, 4 +; CHECK-NEXT: %13 = icmp eq i32 %index.next, %n.vec +; CHECK-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4 + +define void @test_fshl(i32 %width, i16* %A) { +entry: + br label %for.body9.us.us + +for.cond6.for.cond.cleanup8_crit_edge.us.us: ; preds = %for.body9.us.us + ret void + +for.body9.us.us: ; preds = %for.body9.us.us, %entry + %x.020.us.us = phi i32 [ 0, %entry ], [ %inc.us.us, %for.body9.us.us ] + %A.ptr = getelementptr i16, i16* %A, i32 %x.020.us.us + %a = load i16, i16* %A.ptr + %conv4.i.us.us = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 15) + %inc.us.us = add nuw i32 %x.020.us.us, 1 + %exitcond50 = icmp eq i32 %inc.us.us, %width + br i1 %exitcond50, label %for.cond6.for.cond.cleanup8_crit_edge.us.us, label %for.body9.us.us +}