diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3190,6 +3190,75 @@
   return Cost;
 }
 
+static bool isLoadOrFreeCast(X86TTIImpl *TTI, const Value *V,
+                             TTI::TargetCostKind CostKind) {
+  if (isa<LoadInst>(V))
+    return true;
+  const auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+  switch (I->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::PtrToInt:
+    // The cast of a load is free if the cast operation itself is free.
+    return I->hasOneUse() &&
+           TTI->getCastInstrCost(
+               I->getOpcode(), I->getType(), I->getOperand(0)->getType(),
+               TTI::getCastContextHint(I), CostKind, I) == 0 &&
+           isa<LoadInst>(I->getOperand(0)) &&
+           I->getParent() == cast<LoadInst>(I->getOperand(0))->getParent();
+  default:
+    break;
+  }
+  return false;
+}
+
+static bool isFreeOp(X86TTIImpl *TTI, const Instruction *I, const Type *Ty,
+                     TTI::TargetCostKind CostKind, const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::FAdd:
+  case Instruction::FMul:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    bool IsLoadOrFreeCast0 = isLoadOrFreeCast(TTI, I->getOperand(0), CostKind);
+    bool IsLoadOrFreeCast1 = isLoadOrFreeCast(TTI, I->getOperand(1), CostKind);
+    bool IsNotConstant0 = !isa<Constant>(I->getOperand(0));
+    bool IsNotConstant1 = !isa<Constant>(I->getOperand(1));
+    bool IsSingleUseOp0 = I->getOperand(0)->hasOneUse();
+    bool IsSingleUseOp1 = I->getOperand(1)->hasOneUse();
+    bool SameParentOp0 =
+        isa<Instruction>(I->getOperand(0)) &&
+        cast<Instruction>(I->getOperand(0))->getParent() == OpI->getParent();
+    bool SameParentOp1 =
+        isa<Instruction>(I->getOperand(1)) &&
+        cast<Instruction>(I->getOperand(1))->getParent() == OpI->getParent();
+    return (I->getOperand(1) == OpI && !IsLoadOrFreeCast0 && IsSingleUseOp0 &&
+            IsNotConstant0 && SameParentOp0) ||
+           (I->getOperand(0) == OpI && !IsLoadOrFreeCast1 && IsSingleUseOp1 &&
+            IsNotConstant1 && SameParentOp1 &&
+            (!isa<FPMathOperator>(I) || I->hasAllowReassoc())) ||
+           (IsLoadOrFreeCast0 && IsLoadOrFreeCast1 &&
+            (((!IsSingleUseOp1 || !SameParentOp1) && I->getOperand(1) == OpI) ||
+             ((!IsSingleUseOp0 || !SameParentOp0) && I->getOperand(0) == OpI &&
+              (!isa<FPMathOperator>(I) || I->hasAllowReassoc())))) ||
+           I->getOperand(1) == OpI;
+  }
+  case Instruction::Sub:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FSub:
+  case Instruction::FDiv:
+    return I->getOperand(1) == OpI;
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                 MaybeAlign Alignment, unsigned AddressSpace,
                                 TTI::TargetCostKind CostKind,
@@ -3252,6 +3321,33 @@
   if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
     Cost *= 2;
 
+  if (Opcode == Instruction::Load && Cost == LT.first && !ST->isSLM() &&
+      isa_and_nonnull<LoadInst>(I) && I->hasOneUse()) {
+    // TODO: Include SLM when the cost model is correct.
+    // Check if the cost of the scalar load can be considered 0. It can, if an
+    // arithmetic, logic, compare, or cast instruction is able to use the
+    // memory address directly as one of its operands.
+    const Instruction *UI = cast<Instruction>(I->user_back());
+    if (UI->getParent() == I->getParent() &&
+        isFreeOp(this, UI, UI->getType(), CostKind, I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::PtrToInt:
+      // The cast of the load is free if the cast operation itself is free.
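+      // Illustrative sketch only (hypothetical IR, not an extra condition
+      // beyond the checks below): for
+      //   %w = load i64, i64* %p
+      //   %t = trunc i64 %w to i32
+      //   %r = add i32 %x, %t
+      // in a single block, the add can read a 32-bit memory operand, so the
+      // load and its free trunc together cost nothing.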
+      if (cast<Instruction>(UI->user_back())->getParent() == UI->getParent() &&
+          UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(),
+                           TTI::getCastContextHint(UI), CostKind, UI) == 0 &&
+          isFreeOp(this, cast<Instruction>(UI->user_back()), UI->getType(),
+                   CostKind, UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
+  }
+
   return Cost;
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3764,22 +3764,33 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      Align alignment = cast<LoadInst>(VL0)->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
+      const auto *LI = cast<LoadInst>(VL0);
+      Align Alignment = LI->getAlign();
+      InstructionCost ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                   LD->getPointerAddressSpace(), CostKind, LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                   LD->getPointerAddressSpace(), CostKind, LD);
+        }
       }
-      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
-        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
                                          CostKind, VL0);
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
         VecLdCost = TTI->getGatherScatterOpCost(
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, alignment, CostKind, VL0);
+            /*VariableMask=*/false, Alignment, CostKind, VL0);
       }
       if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
         SmallVector<int> NewMask;
diff --git a/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll b/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
--- a/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,7 +10,7 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
@@ -46,7 +46,7 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
diff --git a/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll b/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
--- a/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -58,7 +58,7 @@
 ;CHECK: cost of 1 {{.*}} mul
   %mul = mul nsw i32 %14, 5
   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %15 = load i32, i32* %arrayidx2, align 4
   %add3 = add nsw i32 %15, %mul
   store i32 %add3, i32* %arrayidx2, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE
@@ -28,32 +28,52 @@
 ;
 ; AVX-LABEL: @foo(
 ; AVX-NEXT: entry:
-; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX: vector.ph:
-; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX: vector.body:
-; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; AVX-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
-; AVX-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; AVX-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
-; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; AVX-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; AVX-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; AVX-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
-; AVX: middle.block:
-; AVX-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; AVX: scalar.ph:
 ; AVX-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX: for.cond.cleanup:
 ; AVX-NEXT: ret void
 ; AVX: for.body:
-; AVX-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; AVX-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX6]], align 4
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX2: vector.ph:
+; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
+; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; AVX2: middle.block:
+; AVX2-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX2: scalar.ph:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.cond.cleanup:
+; AVX2-NEXT: ret void
+; AVX2: for.body:
+; AVX2-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -4,7 +4,7 @@
 ; Two cases tested AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount)
 
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s --check-prefix=AVX512
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -17,22 +17,22 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
@@ -55,6 +55,51 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+; AVX512-LABEL: @small_tc(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512: vector.ph:
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
+; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; AVX512-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; AVX512-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX512-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
+; AVX512-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX512: scalar.ph:
+; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -27,23 +27,23 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -166,22 +166,22 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !6
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -248,16 +248,18 @@
 define float @fadd_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fadd_v4i32(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7:!tbaa !.*]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float -0.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7:!tbaa !.*]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], 4.200000e+01
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
+; CHECK-NEXT: ret float [[ADD_3]]
 ;
 entry:
   br label %for.cond
@@ -289,16 +291,18 @@
 define float @fmul_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmul_v4i32(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fmul fast float 1.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fmul fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP0]], 4.200000e+01
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP1]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP2]], [[MUL_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP3]], [[MUL_2]]
+; CHECK-NEXT: ret float [[MUL_3]]
 ;
 entry:
   br label %for.cond
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -126,13 +126,28 @@
 define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolerance) {
 ; CHECK-LABEL: @TestVectorsEqual_alt(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[VEC0:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[VEC0:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[VEC1:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[ADD_3:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[ADD_3]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
 ;
@@ -233,13 +248,28 @@
 define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, float %Tolerance) {
 ; CHECK-LABEL: @TestVectorsEqualFP_alt(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[VEC0:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[VEC0:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[VEC1:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fsub fast float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[ADD_3]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx -slp-threshold=-1 | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -791,11 +791,13 @@
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
@@ -804,8 +806,8 @@
 ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
@@ -822,16 +824,18 @@
 ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
-; CHECK-NEXT: ret float [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP7]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]]
+; CHECK-NEXT: ret float [[TMP17]]
 ;
 ; THRESHOLD-LABEL: @loadadd31(
 ; THRESHOLD-NEXT: entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -153,24 +153,22 @@
 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
+; CHECK-NEXT: [[X0:%.*]] = load i64, i64* [[P0]], align 2
+; CHECK-NEXT: [[X1:%.*]] = load i64, i64* [[P1]], align 2
+; CHECK-NEXT: [[X2:%.*]] = load i64, i64* [[P2]], align 2
+; CHECK-NEXT: [[X3:%.*]] = load i64, i64* [[P3]], align 2
+; CHECK-NEXT: [[Y0:%.*]] = load i64, i64* [[Q0]], align 2
+; CHECK-NEXT: [[Y1:%.*]] = load i64, i64* [[Q1]], align 2
+; CHECK-NEXT: [[Y2:%.*]] = load i64, i64* [[Q2]], align 2
+; CHECK-NEXT: [[Y3:%.*]] = load i64, i64* [[Q3]], align 2
+; CHECK-NEXT: [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[SUB0]]
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB1]]
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB2]]
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB3]]
 ; CHECK-NEXT: ret void
 ;
 %p0 = getelementptr inbounds i64, i64* %p, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -153,24 +153,22 @@
 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
+; CHECK-NEXT: [[X0:%.*]] = load i64, i64* [[P0]], align 2
+; CHECK-NEXT: [[X1:%.*]] = load i64, i64* [[P1]], align 2
+; CHECK-NEXT: [[X2:%.*]] = load i64, i64* [[P2]], align 2
+; CHECK-NEXT: [[X3:%.*]] = load i64, i64* [[P3]], align 2
+; CHECK-NEXT: [[Y0:%.*]] = load i64, i64* [[Q0]], align 2
+; CHECK-NEXT: [[Y1:%.*]] = load i64, i64* [[Q1]], align 2
+; CHECK-NEXT: [[Y2:%.*]] = load i64, i64* [[Q2]], align 2
+; CHECK-NEXT: [[Y3:%.*]] = load i64, i64* [[Q3]], align 2
+; CHECK-NEXT: [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[SUB0]]
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB1]]
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB2]]
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB3]]
 ; CHECK-NEXT: ret void
 ;
 %p0 = getelementptr inbounds i64, i64* %p, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -319,36 +319,34 @@
 ; CHECK-LABEL: @lookahead_limit_users_budget(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
+; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[B:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[B]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
 ; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
+; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
+; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
+; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
+; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8
+; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -34,17 +34,16 @@
 ; AVX-NEXT: [[OR_1:%.*]] = or i64 undef, 1
 ; AVX-NEXT: store i64 [[OR_1]], i64* undef, align 8
 ; AVX-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT: [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8
 ; AVX-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
-; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX-NEXT: [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8
 ; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
-; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
+; AVX-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
 ; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT: store i64 [[AND_2]], i64* [[BAR3]], align 8
 ; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
-; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
-; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; AVX-NEXT: store i64 [[AND_1]], i64* [[BAR4]], align 8
 ; AVX-NEXT: ret void
 ;
 for.body.lr.ph.i:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -63,45 +63,31 @@
 ; }
 
 define i32 @test_mul(i32* nocapture readonly %p) {
-; AVX-LABEL: @test_mul(
-; AVX-NEXT: entry:
-; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT: ret i32 [[TMP2]]
-;
-; SSE-LABEL: @test_mul(
-; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
-; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
-; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
-; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
-; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
-; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
-; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
-; SSE-NEXT: ret i32 [[MUL_714]]
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT: ret i32 [[MUL_714]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -106,7 +106,7 @@
  ; YAML-NEXT: Function: foo
  ; YAML-NEXT: Args:
  ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
- ; YAML-NEXT: - Cost: '-5'
+ ; YAML-NEXT: - Cost: '-2'
  ; YAML-NEXT: - String: ' and with tree size '
  ; YAML-NEXT: - TreeSize: '4'
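
A minimal sketch (illustrative only, not part of the patch; the function and value names are invented) of the scalar pattern the updated X86TTIImpl::getMemoryOpCost now prices at 0: a single-use load whose only user is an arithmetic instruction in the same basic block, which x86 can encode with the load folded into a memory operand (addl (%rdi), %esi):

define i32 @fold_example(i32* %p, i32 %x) {
entry:
  %v = load i32, i32* %p, align 4    ; single use, same block as its user
  %r = add i32 %x, %v                ; the load folds into the add
  ret i32 %r
}

Note that for Sub/UDiv/SDiv/FSub/FDiv, isFreeOp only accepts the load as the second operand (I->getOperand(1) == OpI), matching x86, where only the right-hand source of a subtraction or the divisor of a division can come from memory.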