Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -955,6 +955,11 @@
 def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
                              [IntrNoMem, Returned<0>]>;
+
+def int_phantom_mem : Intrinsic<[],
+                                [llvm_anyptr_ty, llvm_anyptr_ty, llvm_i64_ty],
+                                [IntrArgMemOnly]>;
+
 
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -112,6 +112,10 @@
   /// collected in GEPs.
   bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
+  /// \brief Restore inserts, loads out of phantom_mem intrinsic.
+  InsertElementInst * restoreInserts(InsertElementInst *FirstInsertElem,
+                                     LoadInst *LInstr, Value* Ptr);
+
   /// Try to find horizontal reduction or otherwise vectorize a chain of binary
   /// operators.
   bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
@@ -148,6 +152,8 @@
   /// The getelementptr instructions in a basic block organized by
   /// base pointer.
   WeakTrackingVHListMap GEPs;
+
+  DenseMap<Value *, uint64_t> PhantomMem;
 };
 
 } // end namespace llvm
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4198,6 +4198,21 @@
 
   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   // delete instructions.
+  for (auto &BB : F)
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;) {
+      Instruction *I = &*BBI++;
+
+      CallInst *CI = dyn_cast<CallInst>(I);
+      if (!CI)
+        continue;
+      if (getIntrinsicForCallSite(CI, TLI) == Intrinsic::phantom_mem) {
+        ConstantInt *MaxOffset = cast<ConstantInt>(CI->getOperand(2));
+        PhantomMem[CI->getOperand(0)] = MaxOffset->getZExtValue();
+        CI->removeFromParent();
+        CI->dropAllReferences();
+      }
+    }
+
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -5327,6 +5342,130 @@
   return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false);
 }
 
+InsertElementInst *
+SLPVectorizerPass::restoreInserts(InsertElementInst *FirstInsertElem,
+                                  LoadInst *LInstr, Value *Ptr) {
+  ShuffleVectorInst *Use = nullptr;
+  DenseMap<uint64_t, LoadInst *> Loads;
+  DenseMap<uint64_t, InsertElementInst *> Inserts;
+  DenseMap<uint64_t, GetElementPtrInst *> GEPs;
+  BasicBlock *BB = FirstInsertElem->getParent();
+  VectorType *VecType = FirstInsertElem->getType();
+  uint64_t MaxOffset = PhantomMem[Ptr];
+
+  ConstantInt *C = dyn_cast<ConstantInt>(FirstInsertElem->getOperand(2));
+
+  if (!C || C->getZExtValue() > MaxOffset)
+    return nullptr;
+
+  // We found the first InsertElem in
+  // this chain.
+  uint64_t Offset = C->getZExtValue();
+  Inserts[Offset] = FirstInsertElem;
+  Loads[Offset] = LInstr;
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LInstr->getOperand(0));
+  if (GEP)
+    GEPs[Offset] = GEP;
+
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+    if (InsertElementInst *Insert = dyn_cast<InsertElementInst>(it)) {
+      ConstantInt *C = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      LoadInst *Load = dyn_cast<LoadInst>(Insert->getOperand(1));
+      if (!Load || Insert == FirstInsertElem)
+        continue;
+      GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Load->getOperand(0));
+      Value *Base = Load->getOperand(0);
+      if (Base == Ptr) {
+        if (!C)
+          return nullptr;
+        uint64_t Offset = C->getZExtValue();
+        if (Offset > MaxOffset || !Load->hasOneUse())
+          return nullptr;
+        Inserts[Offset] = Insert;
+        Loads[Offset] = Load;
+        // GEP instruction might be missing for some offsets.
+        if (GEP)
+          GEPs[Offset] = GEP;
+        continue;
+      }
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(it))
+      // Examining the chain of GEP, Load, Insert.
+      if (Ptr == GEP->getOperand(0)) {
+        ConstantInt *C = dyn_cast<ConstantInt>(GEP->getOperand(1));
+        if (!C || GEP->use_empty() || !GEP->hasOneUse())
+          return nullptr;
+        uint64_t Off = C->getZExtValue();
+        LoadInst *Load = dyn_cast<LoadInst>(GEP->user_back());
+        if (!Load || Load->use_empty() || !Load->hasOneUse())
+          return nullptr;
+        InsertElementInst *Insert =
+            dyn_cast<InsertElementInst>(Load->user_back());
+        if (!Insert)
+          return nullptr;
+        ConstantInt *C1 = cast<ConstantInt>(Insert->getOperand(2));
+        uint64_t Off_Ins = C1->getZExtValue();
+        // Offset from GEP must be equal to offset from
+        // Insert otherwise we have to ignore.
+        if (Off != Off_Ins)
+          return nullptr;
+        if (!Insert->use_empty())
+          if (ShuffleVectorInst *Shuffle =
+                  dyn_cast<ShuffleVectorInst>(Insert->user_back()))
+            Use = Shuffle;
+        Inserts[Off] = Insert;
+        Loads[Off] = Load;
+        GEPs[Off] = GEP;
+      }
+  }
+  if (!Use || Use->getParent() != BB)
+    return nullptr;
+
+  IRBuilder<> Builder(LInstr->getParent(), ++BasicBlock::iterator(LInstr));
+  if (!Inserts[0]) {
+    Builder.SetInsertPoint(LInstr->getPrevNode());
+    LoadInst *NewLoad = Builder.CreateLoad(Ptr);
+    NewLoad->setAlignment(LInstr->getAlignment());
+    Loads[0] = NewLoad;
+    Value *NewInsert = Builder.CreateInsertElement(
+        UndefValue::get(VecType), NewLoad, Builder.getInt32(0));
+    Inserts[0] = cast<InsertElementInst>(NewInsert);
+  }
+  for (unsigned i = 1, e = MaxOffset + 1; i < e; i++) {
+    // Building GEP, Load, Insert for
+    // this Offset.
+    GetElementPtrInst *PrevGEP = GEPs[i - 1];
+    if (!Inserts[i]) {
+      assert(Loads[i - 1] != nullptr &&
+             "Couldn't find previous load in the chain.");
+      if (PrevGEP == nullptr)
+        Builder.SetInsertPoint(BB, ++Loads[i - 1]->getIterator());
+      else
+        Builder.SetInsertPoint(BB, ++PrevGEP->getIterator());
+      Value *NewGEP =
+          Builder.CreateGEP(LInstr->getType(), Ptr, Builder.getInt64(i));
+      GEPs[i] = cast<GetElementPtrInst>(NewGEP);
+      Builder.SetInsertPoint(BB, ++Loads[i - 1]->getIterator());
+      Builder.SetCurrentDebugLocation(LInstr->getDebugLoc());
+      LoadInst *NewLoad = Builder.CreateLoad(NewGEP);
+      NewLoad->setAlignment(LInstr->getAlignment());
+      Loads[i] = NewLoad;
+      // InsertElement must be inserted just after
+      // previous InsertElement.
+      InsertElementInst *PrevIns = Inserts[i - 1];
+      Builder.SetInsertPoint(BB, ++PrevIns->getIterator());
+      Value *NewInsert =
+          Builder.CreateInsertElement(PrevIns, NewLoad, Builder.getInt32(i));
+      Inserts[i] = cast<InsertElementInst>(NewInsert);
+    }
+    Inserts[i]->setOperand(0, Inserts[i - 1]);
+  }
+
+  // Update ShuffleVector with restored insert element.
+  Use->setOperand(0, Inserts[MaxOffset]);
+  return cast<InsertElementInst>(Inserts[MaxOffset]);
+}
+
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
   SmallVector<Value *, 16> BuildVector;
@@ -5334,6 +5473,25 @@
   if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
     return false;
 
+  if (LoadInst *LInstr = dyn_cast<LoadInst>(IEI->getOperand(1))) {
+    Value *Ptr = nullptr;
+    if (PhantomMem.find(LInstr->getOperand(0)) != PhantomMem.end())
+      Ptr = LInstr->getOperand(0);
+    else if (GetElementPtrInst *GEP =
+                 dyn_cast<GetElementPtrInst>(LInstr->getOperand(0)))
+      if (PhantomMem.find(GEP->getOperand(0)) != PhantomMem.end())
+        Ptr = GEP->getOperand(0);
+
+    if (!Ptr)
+      return false;
+    if (InsertElementInst *Insert = restoreInserts(IEI, LInstr, Ptr)) {
+      BuildVector.clear();
+      BuildVectorOpds.clear();
+      if (!findBuildVector(Insert, BuildVector, BuildVectorOpds))
+        return false;
+    }
+  }
+
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
   return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
Index: test/Transforms/SLPVectorizer/X86/pr21780.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/pr21780.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s
+
+define <4 x double> @load_four_scalars_but_use_two(double* %ptr) #0 {
+; CHECK-LABEL: @load_four_scalars_but_use_two(
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTR]] to <4 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[INS0:%.*]] = insertelement <4 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[INS0]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> [[INS2]], double [[TMP9]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+  call void @llvm.phantom.mem.p0f64.p0f64(double* %ptr, double* null, i64 3)
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %ld0 = load double, double* %ptr, align 8
+  %ld2 = load double, double* %arrayidx2, align 8
+  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
+  %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2
+  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @load_four_scalars_but_use_two1(double* %ptr) #0 {
+; CHECK-LABEL: @load_four_scalars_but_use_two1(
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[PTR]] to <4 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x double> [[INS1]], double [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[INS2]], double [[TMP8]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+  call void @llvm.phantom.mem.p0f64.p0f64(double* %ptr, double* null, i64 3)
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %ld1 = load double, double* %arrayidx1, align 8
+  %ld2 = load double, double* %arrayidx2, align 8
+  %ins1 = insertelement <4 x double> undef, double %ld1, i32 1
+  %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2
+  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.phantom.mem.p0f64.p0f64(double*, double*, i64) #1