diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2884,36 +2884,43 @@
     case Instruction::InsertElement: {
       assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");

-      int Offset = *getInsertIndex(VL[0], 0);
-      ValueList Operands(VL.size());
-      Operands[0] = cast<Instruction>(VL[0])->getOperand(1);
-      bool IsConsecutive = true;
-      for (unsigned I = 1, E = VL.size(); I < E; ++I) {
-        IsConsecutive &= (I == *getInsertIndex(VL[I], 0) - Offset);
-        Operands[I] = cast<Instruction>(VL[I])->getOperand(1);
+      // Check that we have a buildvector and not a shuffle of 2 or more
+      // different vectors.
+      int NotUsedVectors = 0;
+      ValueList Operands;
+      ValueSet SourceVectors;
+      for (Value *V : VL)
+        SourceVectors.insert(cast<InsertElementInst>(V)->getOperand(0));
+
+      for (Value *V : VL) {
+        auto *I = cast<InsertElementInst>(V);
+        if (!SourceVectors.contains(I)) {
+          ++NotUsedVectors;
+          if (NotUsedVectors >= 2) {
+            // Found 2nd source vector - cancel.
+            LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+                                 "different source vectors.\n");
+            newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                         ReuseShuffleIndicies);
+            BS.cancelScheduling(VL, VL0);
+            return;
+          }
+        }
+        Operands.push_back(I->getOperand(1));
       }
-      // FIXME: support vectorization of non-consecutive inserts
-      // using shuffle with its proper cost estimation
-      if (IsConsecutive) {
-        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
-        LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+      LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");

-        TE->setOperand(0, Operands);
+      TE->setOperand(0, Operands);

-        ValueList VectorOperands;
-        for (Value *V : VL)
-          VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));
+      ValueList VectorOperands;
+      for (Value *V : VL)
+        VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));

-        TE->setOperand(1, VectorOperands);
+      TE->setOperand(1, VectorOperands);

-        buildTree_rec(Operands, Depth + 1, {TE, 0});
-        return;
-      }
-
-      LLVM_DEBUG(dbgs() << "SLP: skipping non-consecutive inserts.\n");
-      BS.cancelScheduling(VL, VL0);
-      buildTree_rec(Operands, Depth, UserTreeIdx);
+      buildTree_rec(Operands, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::Load: {
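A quick aside on the hunk above (editorial commentary, not part of the patch): the new buildTree_rec code accepts an insertelement bundle only when it forms a single buildvector, i.e. at most one insert in the bundle produces a value that is not consumed as the source-vector operand of another insert in the bundle. A minimal standalone sketch of that check, assuming hypothetical names (`InsertElt`, `isSingleBuildVector`) and plain STL containers instead of the LLVM data structures:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

// Simplified stand-in for an insertelement instruction: it writes one scalar
// into one lane of SourceVector and yields the value Result.
struct InsertElt {
  const void *Result;
  const void *SourceVector;
};

// Sketch of the buildvector check: collect every source vector used by the
// bundle, then count the inserts whose own result is not one of those
// sources. A second such "chain tail" means the bundle mixes two or more
// source vectors, so it is rejected (gathered) instead of vectorized.
static bool isSingleBuildVector(const std::vector<InsertElt> &Bundle) {
  std::unordered_set<const void *> SourceVectors;
  for (const InsertElt &IE : Bundle)
    SourceVectors.insert(IE.SourceVector);

  int NotUsedVectors = 0;
  for (const InsertElt &IE : Bundle)
    if (!SourceVectors.count(IE.Result) && ++NotUsedVectors >= 2)
      return false; // found 2nd source vector - cancel
  return true;
}

int main() {
  int V0, V1, V2, V3; // opaque stand-ins for SSA values; V0 starts the chain
  std::vector<InsertElt> SingleChain = {{&V1, &V0}, {&V2, &V1}, {&V3, &V2}};
  std::vector<InsertElt> TwoChains = {{&V1, &V0}, {&V3, &V2}};
  std::printf("%d %d\n", isSingleBuildVector(SingleChain),
              isSingleBuildVector(TwoChains)); // prints "1 0"
}
```

With the old IsConsecutive restriction dropped, non-consecutive insert positions are handled by the shuffle-based lowering that the following hunks price and emit.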
@@ -3806,21 +3813,40 @@
     auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
     unsigned const NumElts = SrcVecTy->getNumElements();
+    unsigned const NumScalars = VL.size();
     APInt DemandedElts = APInt::getNullValue(NumElts);
-    for (auto *V : VL)
-      DemandedElts.setBit(*getInsertIndex(V, 0));
+    // TODO: Add support for Instruction::InsertValue.
+    unsigned Offset = UINT_MAX;
+    bool IsIdentity = true;
+    SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+    for (unsigned I = 0; I < NumScalars; ++I) {
+      Value *V = VL[I];
+      Optional<int> InsertIdx = getInsertIndex(V, 0);
+      if (!InsertIdx)
+        continue;
+      unsigned Idx = *InsertIdx;
+      DemandedElts.setBit(Idx);
+      if (Idx < Offset) {
+        Offset = Idx;
+        IsIdentity &= I == 0;
+      } else {
+        IsIdentity &= Idx - Offset == I;
+      }
+      ShuffleMask[Idx] = I;
+    }

     InstructionCost Cost = 0;
     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                           /*Insert*/ true, /*Extract*/ false);

-    unsigned const NumScalars = VL.size();
-    unsigned const Offset = *getInsertIndex(VL[0], 0);
-    if (NumElts != NumScalars && Offset % NumScalars != 0)
+    if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
       Cost += TTI->getShuffleCost(
           TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
           Offset,
           FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
+    else if (!IsIdentity)
+      Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
+                                  ShuffleMask);

     return Cost;
   }
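Another illustrative aside (not part of the patch): the cost hunk above derives an identity flag and a single-source scatter mask from the insert positions, and the vectorizeTree hunk further below additionally builds a two-source mask that blends the scattered value into the destination vector. A small self-contained sketch of both derivations, assuming the hypothetical helper `buildInsertMasks`, a `-1` sentinel standing in for `UndefMaskElem`, and plain STL types rather than the LLVM API:

```cpp
#include <climits>
#include <cstdio>
#include <numeric>
#include <vector>

static constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Positions[Lane] is the insertelement index of scalar Lane; NumElts is the
// width of the destination vector.
struct InsertMasks {
  bool IsIdentity = true;     // every lane already lands at Offset + Lane
  unsigned Offset = UINT_MAX; // smallest insert position seen
  std::vector<int> Scatter;   // 1st shuffle: scatter lane L to Pos - Offset
  std::vector<int> Blend;     // 2nd shuffle: blend scattered value into dest
};

static InsertMasks buildInsertMasks(unsigned NumElts,
                                    const std::vector<int> &Positions) {
  InsertMasks R;
  // Pass 1: find the minimal position and check for an identity layout,
  // mirroring the IsIdentity/Offset computation in the patch.
  for (unsigned Lane = 0, E = Positions.size(); Lane < E; ++Lane) {
    unsigned Pos = Positions[Lane];
    if (Pos < R.Offset) {
      R.Offset = Pos;
      R.IsIdentity &= Lane == 0;
    } else {
      R.IsIdentity &= Pos - R.Offset == Lane;
    }
  }
  // Pass 2: single-source scatter mask over the vectorized scalars.
  R.Scatter.assign(NumElts, kUndef);
  for (unsigned Lane = 0, E = Positions.size(); Lane < E; ++Lane)
    R.Scatter[Positions[Lane] - R.Offset] = static_cast<int>(Lane);
  // Blend mask over two operands: the original destination vector
  // (elements 0..NumElts-1) and the scattered value (NumElts..2*NumElts-1).
  R.Blend.resize(NumElts);
  std::iota(R.Blend.begin(), R.Blend.end(), 0);
  for (unsigned I = 0; I < NumElts; ++I)
    if (R.Scatter[I] != kUndef)
      R.Blend[R.Offset + I] = static_cast<int>(NumElts + I);
  return R;
}

int main() {
  // simple_select2 in the test below: four scalars inserted at positions
  // 0, 2, 4 and 7 of an <8 x float>, so the layout is not an identity.
  InsertMasks M = buildInsertMasks(8, {0, 2, 4, 7});
  std::printf("identity=%d offset=%u\nscatter:", M.IsIdentity, M.Offset);
  for (int V : M.Scatter)
    std::printf(" %d", V);
  std::printf("\nblend  :");
  for (int V : M.Blend)
    std::printf(" %d", V);
  std::printf("\n");
  // Expected: scatter = 0 -1 1 -1 2 -1 -1 3 and blend = 8 1 10 3 12 5 6 15,
  // i.e. the two shufflevector masks checked in the updated test.
}
```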
@@ -4350,8 +4376,31 @@
                         << "SLP: Current total cost = " << Cost << "\n");
   }
+  // Checks if 2 insertelements are from the same buildvector.
+  auto &&AreFromSingleVector = [](Value *V1, Value *V2) {
+    if (V1->getType() != V2->getType())
+      return false;
+    Instruction *IE1 = cast<Instruction>(V1);
+    Instruction *IE2 = cast<Instruction>(V2);
+
+    do {
+      if (IE1 == V2 || IE2 == V1)
+        return true;
+      if (IE1)
+        IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+      if (IE2)
+        IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+    } while (IE1 || IE2);
+    return false;
+  };
+
   SmallPtrSet<Value *, 4> ExtractCostCalculated;
   InstructionCost ExtractCost = 0;
+  SmallBitVector IsIdentity;
+  SmallVector<unsigned> VF;
+  SmallVector<SmallVector<int>> ShuffleMask;
+  SmallVector<Value *> FirstUsers;
+  SmallVector<APInt> DemandedElts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!ExtractCostCalculated.insert(EU.Scalar).second)
       continue;
@@ -4367,6 +4416,36 @@
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;

+    // If found user is an insertelement, do not calculate extract cost but try
+    // to detect it as a final shuffled/identity match.
+    if (EU.User && isa<InsertElementInst>(EU.User)) {
+      if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
+        Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+        if (!InsertIdx)
+          continue;
+        int VecId = -1;
+        for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+          if (AreFromSingleVector(EU.User, FirstUsers[I])) {
+            VecId = I;
+            break;
+          }
+        }
+        if (VecId < 0) {
+          VF.push_back(FTy->getNumElements());
+          ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+          FirstUsers.push_back(EU.User);
+          DemandedElts.push_back(APInt::getNullValue(VF.back()));
+          IsIdentity.push_back(true);
+          VecId = FirstUsers.size() - 1;
+        }
+        int Idx = *InsertIdx;
+        ShuffleMask[VecId][Idx] = EU.Lane;
+        IsIdentity.set(IsIdentity.test(VecId) &
+                       (EU.Lane == Idx || EU.Lane == UndefMaskElem));
+        DemandedElts[VecId].setBit(Idx);
+      }
+    }
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
@@ -4387,6 +4466,39 @@

   InstructionCost SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
+  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+    if (!IsIdentity.test(I)) {
+      InstructionCost C = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc,
+          cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                        << " for final shuffle of insertelement external users "
+                        << *VectorizableTree.front()->Scalars.front() << ".\n"
+                        << "SLP: Current total cost = " << Cost << "\n");
+      Cost += C;
+    }
+    unsigned VF = ShuffleMask[I].size();
+    for (int &Mask : ShuffleMask[I])
+      Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask;
+    InstructionCost C = TTI->getShuffleCost(
+        TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
+        ShuffleMask[I]);
+    LLVM_DEBUG(
+        dbgs()
+        << "SLP: Adding cost " << C
+        << " for final shuffle of vector node and external insertelement users "
+        << *VectorizableTree.front()->Scalars.front() << ".\n"
+        << "SLP: Current total cost = " << Cost << "\n");
+    Cost += C;
+    InstructionCost InsertCost = TTI->getScalarizationOverhead(
+        cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+        /*Insert*/ true,
+        /*Extract*/ false);
+    Cost -= InsertCost;
+    LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+                      << " for insertelements gather.\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+  }

 #ifndef NDEBUG
   SmallString<256> Str;
@@ -4856,28 +4968,53 @@
       const unsigned NumScalars = E->Scalars.size();

       // Create InsertVector shuffle if necessary
-      if (NumElts != NumScalars) {
-        unsigned MinIndex = *getInsertIndex(E->Scalars[0], 0);
-        Instruction *FirstInsert = nullptr;
-        for (auto *Scalar : E->Scalars)
-          if (!FirstInsert &&
-              !is_contained(E->Scalars,
-                            cast<Instruction>(Scalar)->getOperand(0)))
-            FirstInsert = cast<Instruction>(Scalar);
-
-        // Create shuffle to resize vector
-        SmallVector<int> Mask(NumElts, UndefMaskElem);
+      Instruction *FirstInsert = nullptr;
+      bool IsIdentity = true;
+      unsigned Offset = UINT_MAX;
+      for (unsigned I = 0; I < NumScalars; ++I) {
+        Value *Scalar = E->Scalars[I];
+        if (!FirstInsert &&
+            !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
+          FirstInsert = cast<Instruction>(Scalar);
+        Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+        if (!InsertIdx)
+          continue;
+        unsigned Idx = *InsertIdx;
+        if (Idx < Offset) {
+          Offset = Idx;
+          IsIdentity &= I == 0;
+        } else {
+          IsIdentity &= Idx - Offset == I;
+        }
+      }
+
+      // Create shuffle to resize vector
+      SmallVector<int> Mask(NumElts, UndefMaskElem);
+      if (!IsIdentity) {
+        for (unsigned I = 0; I < NumScalars; ++I) {
+          Value *Scalar = E->Scalars[I];
+          Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+          if (!InsertIdx)
+            continue;
+          Mask[*InsertIdx - Offset] = I;
+        }
+      } else {
         std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+      }
+      if (!IsIdentity || NumElts != NumScalars)
         V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);

-        const unsigned MaxIndex = MinIndex + NumScalars;
-        for (unsigned I = 0; I < NumElts; I++)
-          Mask[I] =
-              (I < MinIndex || I >= MaxIndex) ? I : NumElts - MinIndex + I;
+      if (NumElts != NumScalars) {
+        SmallVector<int> InsertMask(NumElts);
+        std::iota(InsertMask.begin(), InsertMask.end(), 0);
+        for (unsigned I = 0; I < NumElts; I++) {
+          if (Mask[I] != UndefMaskElem)
+            InsertMask[Offset + I] = NumElts + I;
+        }
         V = Builder.CreateShuffleVector(
-            FirstInsert->getOperand(0), V, Mask,
-            cast<Instruction>(E->Scalars[NumScalars - 1])->getName());
+            FirstInsert->getOperand(0), V, InsertMask,
+            cast<Instruction>(E->Scalars.back())->getName());
       }

       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -40,31 +40,11 @@
 define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select2(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
-; CHECK-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <8 x float> undef, float [[S0]], i32 0
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <8 x float> [[RA]], float [[S1]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <8 x float> [[RB]], float [[S2]], i32 4
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <8 x float> [[RC]], float [[S3]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[RD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP3]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[RD1]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1