Index: llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -95,9 +95,11 @@
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands.
+  /// \param UserCost Cost of the user operations of \p VL if they may affect
+  /// the cost of the vectorization.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          bool AllowReorder = false);
+                          int UserCost = 0, bool AllowReorder = false);
 
   /// \brief Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4702,11 +4702,11 @@
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, true);
+  return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
 }
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           bool AllowReorder) {
+                                           int UserCost, bool AllowReorder) {
   if (VL.size() < 2)
     return false;
 
@@ -4815,7 +4815,7 @@
         continue;
 
       R.computeMinimumValueSizes();
-      int Cost = R.getTreeCost();
+      int Cost = R.getTreeCost() - UserCost;
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
 
@@ -5748,9 +5748,17 @@
 ///
 /// Returns true if it matches
 static bool findBuildVector(InsertElementInst *LastInsertElem,
-                            SmallVectorImpl<Value *> &BuildVectorOpds) {
+                            TargetTransformInfo *TTI,
+                            SmallVectorImpl<Value *> &BuildVectorOpds,
+                            int &UserCost) {
+  UserCost = 0;
   Value *V = nullptr;
   do {
+    if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+      UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+                                          LastInsertElem->getType(),
+                                          CI->getZExtValue());
+    }
     BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
     V = LastInsertElem->getOperand(0);
     if (isa<UndefValue>(V))
@@ -5965,13 +5973,17 @@
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
+  int UserCost;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildVector(IEI, BuildVectorOpds))
+  if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+      (llvm::all_of(BuildVectorOpds,
+                    [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+       isShuffle(BuildVectorOpds)))
     return false;
 
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R);
+  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
 }
 
 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -6049,8 +6061,8 @@
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 &&
-          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            /*UserCost=*/0, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
Index: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -31,50 +31,54 @@
 ;
 ; GATHER-LABEL: @PR28330(
 ; GATHER-NEXT:  entry:
-; GATHER-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
-; GATHER-NEXT:    [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; GATHER-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; GATHER-NEXT:    [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; GATHER-NEXT:    [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
-; GATHER-NEXT:    [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; GATHER-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
-; GATHER-NEXT:    [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; GATHER-NEXT:    [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
-; GATHER-NEXT:    [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; GATHER-NEXT:    [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
-; GATHER-NEXT:    [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; GATHER-NEXT:    [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; GATHER-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; GATHER:       for.body:
-; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; GATHER-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> <i32 -720, i32 -720>, <2 x i32> <i32 -80, i32 -80>
-; GATHER-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; GATHER-NEXT:    [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]]
-; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; GATHER-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]]
-; GATHER-NEXT:    [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
-; GATHER-NEXT:    [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
-; GATHER-NEXT:    [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
-; GATHER-NEXT:    [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
-; GATHER-NEXT:    [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
-; GATHER-NEXT:    [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
-; GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1
-; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2
-; GATHER-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3
-; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4
-; GATHER-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5
-; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6
-; GATHER-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]])
-; GATHER-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]]
-; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
+; GATHER-NEXT:    [[TMPP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT:    [[TMPP20:%.*]] = add i32 [[TMPP17]], [[TMP19]]
+; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT:    [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]]
+; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT:    [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]]
+; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT:    [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]]
+; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT:    [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]]
+; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT:    [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]]
+; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT:    [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]]
+; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], [[TMPP17]]
+; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]]
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR28330(
@@ -179,50 +183,54 @@
 ;
 ; GATHER-LABEL: @PR32038(
 ; GATHER-NEXT:  entry:
-; GATHER-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
-; GATHER-NEXT:    [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; GATHER-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; GATHER-NEXT:    [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; GATHER-NEXT:    [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
-; GATHER-NEXT:    [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; GATHER-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
-; GATHER-NEXT:    [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; GATHER-NEXT:    [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
-; GATHER-NEXT:    [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; GATHER-NEXT:    [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
-; GATHER-NEXT:    [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; GATHER-NEXT:    [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; GATHER-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; GATHER:       for.body:
-; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; GATHER-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> <i32 -720, i32 -720>, <2 x i32> <i32 -80, i32 -80>
-; GATHER-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; GATHER-NEXT:    [[TMP20:%.*]] = add i32 -5, [[TMP3]]
-; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; GATHER-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]]
-; GATHER-NEXT:    [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
-; GATHER-NEXT:    [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
-; GATHER-NEXT:    [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
-; GATHER-NEXT:    [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
-; GATHER-NEXT:    [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
-; GATHER-NEXT:    [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
-; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
-; GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1
-; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2
-; GATHER-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3
-; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4
-; GATHER-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5
-; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6
-; GATHER-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]])
-; GATHER-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP13]], -5
-; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
+; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT:    [[TMPP20:%.*]] = add i32 -5, [[TMP19]]
+; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT:    [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]]
+; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT:    [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]]
+; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT:    [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]]
+; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT:    [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]]
+; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT:    [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]]
+; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT:    [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]]
+; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], -5
+; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]]
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR32038(
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -387,14 +387,14 @@
 ; to do this backwards this backwards
 define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
 ; CHECK-LABEL: @reconstruct(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
 ; CHECK-NEXT:    ret <4 x i32> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @reconstruct(
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -12,19 +12,20 @@
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1) to <2 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -34,18 +34,15 @@
 define <4 x i16> @truncate_v_v(<4 x i32> %lhs) {
 ; CHECK-LABEL: @truncate_v_v(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[LHS:%.*]], i32 0
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[VECEXT]] to i16
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[LHS]], i32 1
-; CHECK-NEXT:    [[CONV2:%.*]] = trunc i32 [[VECEXT1]] to i16
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[CONV2]], i32 1
-; CHECK-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[LHS]], i32 2
-; CHECK-NEXT:    [[CONV5:%.*]] = trunc i32 [[VECEXT4]] to i16
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[CONV5]], i32 2
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[LHS]], i32 3
-; CHECK-NEXT:    [[CONV8:%.*]] = trunc i32 [[VECEXT7]] to i16
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[CONV8]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
+; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3
 ; CHECK-NEXT:    ret <4 x i16> [[VECINIT9]]
 ;
 entry:
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -11,34 +11,32 @@
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb279:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
 ; CHECK-NEXT:    br label [[BB283:%.*]]
 ; CHECK:       bb283:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
 ; CHECK-NEXT:    br label [[BB284:%.*]]
 ; CHECK:       bb284:
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
 ; CHECK-NEXT:    br label [[BB21_I:%.*]]
 ; CHECK:       bb21.i:
 ; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
 ; CHECK:       bb22.i:
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
 ; CHECK-NEXT:    br label [[BB32_I:%.*]]
 ; CHECK:       bb32.i:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]]
-; CHECK-NEXT:    [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
-; CHECK-NEXT:    [[TMP317:%.*]] = fptrunc double undef to float
-; CHECK-NEXT:    [[TMP319:%.*]] = fptrunc double undef to float
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0
-; CHECK-NEXT:    [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
 bb279: