Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -94,6 +94,10 @@
   /// collected in GEPs.
   bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
+  /// \brief Restore inserts, loads out of dereferenceable_or_null attribute.
+  InsertElementInst * restoreInserts(InsertElementInst *FirstInsertElem,
+                                     LoadInst *LInstr, const MDNode *MD);
+
   /// Try to find horizontal reduction or otherwise vectorize a chain of binary
   /// operators.
   bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -3714,8 +3714,6 @@
 }
 
 void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) {
-  Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null "
-         "apply only to pointer types", &I);
   Assert(isa<LoadInst>(I),
          "dereferenceable, dereferenceable_or_null apply only to load"
          " instructions, use attributes for calls or invokes", &I);
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -57,6 +57,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
@@ -87,6 +88,10 @@
 MaxArraySize("instcombine-maxarray-size", cl::init(1024),
              cl::desc("Maximum array size considered when doing a combine"));
 
+DenseMap<Value *, Value *> Bases;
+
+DenseMap<Value *, std::pair<Value*, Value*>> GEPs;
+
 Value *InstCombiner::EmitGEPOffset(User *GEP) {
   return llvm::EmitGEPOffset(Builder, DL, GEP);
 }
@@ -1415,6 +1420,15 @@
 Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
 
+  if (!dyn_cast<VectorType>(Ops[0]->getType()) && GEP.getNumOperands() == 2)
+    if (Constant *CST = dyn_cast<Constant>(GEP.getOperand(1))) {
+       Value *PtrOp = GEP.getOperand(0);
+       Value *GEPResult = llvm::cast<llvm::Value>(&GEP);
+       if (CST > Bases[PtrOp])
+           Bases[PtrOp]=CST;
+       GEPs[GEPResult] = std::make_pair(PtrOp, CST);
+  }
+
   if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops,
                                  SQ.getWithInstruction(&GEP)))
     return replaceInstUsesWith(GEP, V);
@@ -2839,6 +2853,42 @@
     // Check to see if we can DCE the instruction.
     if (isInstructionTriviallyDead(I, &TLI)) {
       DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
+      // If we have a load here then we need to
+      // update other loads with dereferenceable_or_null
+      // for the same base pointer.
+      if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+        BasicBlock *BB = Load->getParent();
+        for (BasicBlock::iterator Iter = BB->begin(),
+                            E = BB->end(); Iter != E; ) {
+          long long MaxOffset=0, CurPos=0;
+          if (LoadInst *LInstr = dyn_cast<LoadInst>(Iter++)) {
+           if (LInstr->getMetadata(
+                  LLVMContext::MD_dereferenceable_or_null) != nullptr)
+                 continue;
+           if (Bases[LInstr->getOperand(0)])
+              MaxOffset = (dyn_cast<ConstantInt>
+                     (Bases[LInstr->getOperand(0)]))->getZExtValue();
+             else
+           if (GEPs.count(LInstr->getOperand(0)) > 0) {
+             MaxOffset = (dyn_cast<ConstantInt>
+                  (Bases[GEPs[LInstr->getOperand(0)].first]))->getZExtValue();
+             CurPos = (dyn_cast<ConstantInt>(
+                         GEPs[LInstr->getOperand(0)].second))->getZExtValue();
+           }
+           if ((MaxOffset-CurPos) != 0)
+            {
+             Constant *C = ConstantInt::get(LInstr->getContext(),
+                    APInt(64, (MaxOffset - CurPos)
+                              *DL.getTypeStoreSize(LInstr->getType())));
+             SmallVector<Metadata*, 3> Vals;
+             MDBuilder MDB(LInstr->getContext());
+             Vals.push_back(MDB.createConstant(C));
+             LInstr->setMetadata(LLVMContext::MD_dereferenceable_or_null,
+                         MDNode::get(LInstr->getContext(), Vals));
+           }
+          }
+        }
+      }
       eraseInstFromFunction(*I);
       ++NumDeadInst;
       MadeIRChange = true;
@@ -3134,6 +3184,8 @@
                                 LoopInfo *LI = nullptr) {
   auto &DL = F.getParent()->getDataLayout();
   ExpensiveCombines |= EnableExpensiveCombines;
+  Bases.clear();
+  GEPs.clear();
 
   /// Builder - This is an IRBuilder that automatically inserts new
   /// instructions into the worklist when they are created.
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4836,6 +4836,138 @@
   return Res;
 }
 
+InsertElementInst* SLPVectorizerPass::restoreInserts(InsertElementInst*
+                                         FirstInsertElem, LoadInst *LInstr,
+                                         const MDNode *MD)
+{
+     Value *Ptr;
+     Value *Use = nullptr;
+     SmallVector<int64_t, 16> Offsets;
+     SmallDenseMap<int64_t, Value*> Inserts;
+     BasicBlock *BB = FirstInsertElem->getParent();
+     int64_t FirstAt = 0;
+     int64_t TypeSize = DL->getTypeStoreSize(LInstr->getType());
+     VectorType *VecType = FirstInsertElem->getType();
+
+     ConstantInt *C = cast<ConstantInt>(FirstInsertElem->getOperand(2));
+     if (C->getZExtValue() == 0) {
+       // We found the first InsertElem in
+       // this chain.
+       Inserts[0] = FirstInsertElem;
+       Offsets.push_back(0);
+     }
+     if (GetElementPtrInst *GEP =
+             dyn_cast<GetElementPtrInst>(LInstr->getOperand(0)))
+      {
+        Ptr = GEP->getOperand(0);
+        // Looks like the FirstInsertElem was not first in the chain,
+        // saving its offset in FirstAt.
+        if (ConstantInt *C = dyn_cast<ConstantInt>(GEP->getOperand(1)))
+         FirstAt = C->getZExtValue() * DL->getTypeStoreSize(LInstr->getType());
+          else
+         // Complex GEP for vector insert.
+         return nullptr;
+     } else
+        Ptr = LInstr->getOperand(0);
+     ConstantAsMetadata *ValMD =
+                 dyn_cast<ConstantAsMetadata>(MD->getOperand(0));
+     int64_t MaxOffset =
+            cast<ConstantInt>(ValMD->getValue())->getZExtValue() +
+                FirstAt;
+     IRBuilder<NoFolder> Builder(LInstr->getParent(),
+                                 ++BasicBlock::iterator(LInstr));
+     DEBUG(dbgs() << "Found maxmimum offset for " << *Ptr << " " <<
+             MaxOffset << "\n");
+     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++)
+       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(it))
+        {
+         // Examining the chain of GEP, Load, Insert.
+         if (Ptr == GEP->getOperand(0)) {
+           if (ConstantInt *C = dyn_cast<ConstantInt>(GEP->getOperand(1)))
+            {
+              int64_t Off = C->getZExtValue()*TypeSize;
+              Offsets.push_back(Off);
+              if (GEP->use_empty())
+                   return nullptr;
+              if (!GEP->hasOneUse())
+                   return nullptr;
+              LoadInst *Load = dyn_cast<LoadInst>(GEP->user_back());
+              if (!Load)
+                   return nullptr;
+              if (Load->use_empty())
+                   return nullptr;
+              if (!Load->hasOneUse())
+                   return nullptr;
+              InsertElementInst *Insert =
+                       dyn_cast<InsertElementInst>(Load->user_back());
+              if (!Insert)
+                   return nullptr;
+              ConstantInt *C1 = cast<ConstantInt>(Insert->getOperand(2));
+              int64_t Off1 = C1->getZExtValue()*TypeSize;
+              // Offset from GEP must be equal to offset
+              // from Load otherwise ignoring.
+              if (Off != Off1)
+                       return nullptr;
+              Inserts[Off] = Insert;
+              if (!Insert->use_empty())
+                 Use = Insert->user_back();
+           } else // Complex GEP for vector insert.
+             return nullptr;
+         }
+       }
+
+     for (int i=0; i<MaxOffset+TypeSize; i+=TypeSize) {
+       // Walk through possible offsets.
+       bool Found=false;
+       for (unsigned j=0; j<Offsets.size(); j++)
+         if (Offsets[j]==i) {
+          // We found alrerady existing InsertElement
+          // for particular offset.
+          InsertElementInst *Ins = dyn_cast<InsertElementInst>(Inserts[i]);
+          assert(Ins != nullptr && "Expect insert element instruction here");
+          // We don't need to update InsertElement for
+          // offset quals to 0, it should be Undef.
+          if (i != 0 && (Ins->getOperand(0) != Inserts[i-TypeSize]))
+                               Ins->setOperand(0, Inserts[i-TypeSize]);
+          Found=true;
+         }
+       if (!Found) {
+         // Building GEP, Load, Insert for
+         // this Offset.
+         Builder.SetInsertPoint(LInstr);
+         Builder.SetCurrentDebugLocation(LInstr->getDebugLoc());
+         Value *NewGEP = Builder.CreateGEP(LInstr->getType(), Ptr,
+                     Builder.getInt64(i/TypeSize));
+         Value *NewLoad = Builder.CreateLoad(NewGEP);
+         if (i==0) {
+            // For Offset 0 it should be Undef.
+            Inserts[0] = Builder.CreateInsertElement(
+                 UndefValue::get(VecType), NewLoad, Builder.getInt64(0));
+            FirstInsertElem = dyn_cast<InsertElementInst>(Inserts[0]);
+           } else {
+            // InsertElement must be inserted just after
+            // previous InsertElement.
+            InsertElementInst *PrevIns = dyn_cast<InsertElementInst>(
+                               Inserts[i-TypeSize]);
+            Instruction *InsAt = PrevIns->getNextNode();
+            IRBuilder<NoFolder> Builder1(InsAt->getParent(),
+                    ++BasicBlock::iterator(InsAt));
+            Builder1.SetInsertPoint(InsAt);
+            Builder1.SetCurrentDebugLocation(InsAt->getDebugLoc());
+            Inserts[i] = Builder1.CreateInsertElement(
+              PrevIns, NewLoad, Builder1.getInt64(i/TypeSize));
+        }
+      }
+     }
+     if (ShuffleVectorInst* Shuffle = dyn_cast<ShuffleVectorInst>(Use))
+      {
+        // Update ShuffleVector with restored insert element.
+        Shuffle->setOperand(0, Inserts[MaxOffset]);
+        return FirstInsertElem;
+     }
+    return nullptr;
+}
+
 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
                                                  BasicBlock *BB, BoUpSLP &R,
                                                  TargetTransformInfo *TTI) {
@@ -4995,6 +5127,16 @@
       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
         continue;
 
+      if (LoadInst *LInstr = dyn_cast<LoadInst>(
+                                          FirstInsertElem->getOperand(1)))
+       if (const MDNode *MD = LInstr->getMetadata(
+                                     LLVMContext::MD_dereferenceable_or_null))
+        {
+         if (InsertElementInst *Insert =
+                                restoreInserts(FirstInsertElem, LInstr, MD))
+           findBuildVector(Insert, BuildVector, BuildVectorOpds);
+      }
+
       // Vectorize starting with the build vector operands ignoring the
       // BuildVector instructions for the purpose of scheduling and user
       // extraction.
Index: test/Transforms/SLPVectorizer/X86/21780.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/21780.ll
@@ -0,0 +1,97 @@
+; RUN: opt < %s -instcombine -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s --check-prefix=SLP
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s --check-prefix=ICOMB
+
+; SLP-LABEL: load_four_scalars_but_use_two
+; SLP:  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+; SLP-NEXT:  %1 = getelementptr double, double* %ptr, i64 1
+; SLP-NEXT:  %2 = getelementptr double, double* %ptr, i64 3
+; SLP-NEXT:  %3 = bitcast double* %ptr to <4 x double>*
+; SLP-NEXT:  %4 = load <4 x double>, <4 x double>* %3, align 8
+; SLP-NEXT:  %5 = extractelement <4 x double> %4, i32 0
+; SLP-NEXT:  %ins0 = insertelement <4 x double> undef, double %5, i32 0
+; SLP-NEXT:  %6 = extractelement <4 x double> %4, i32 1
+; SLP-NEXT:  %7 = insertelement <4 x double> %ins0, double %6, i64 1
+; SLP-NEXT:  %8 = extractelement <4 x double> %4, i32 2
+; SLP-NEXT:  %ins2 = insertelement <4 x double> %7, double %8, i32 2
+; SLP-NEXT:  %9 = extractelement <4 x double> %4, i32 3
+; SLP-NEXT:  %10 = insertelement <4 x double> %ins2, double %9, i64 3
+; SLP-NEXT:  %shuffle = shufflevector <4 x double> %10, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; SLP-NEXT:  ret <4 x double> %shuffle
+
+; ICOMB-LABEL: load_four_scalars_but_use_two 
+; ICOMB:  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+; ICOMB-NEXT:  %ld0 = load double, double* %ptr, align 8, !dereferenceable_or_null !0
+; ICOMB-NEXT:  %ld2 = load double, double* %arrayidx2, align 8, !dereferenceable_or_null !1
+; ICOMB-NEXT:  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
+; ICOMB-NEXT:  %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2
+; ICOMB-NEXT:  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; ICOMB-NEXT:  ret <4 x double> %shuffle
+
+define <4 x double> @load_four_scalars_but_use_two(double* %ptr) {
+  %arrayidx0 = getelementptr inbounds double, double* %ptr, i64 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+
+  %ld0 = load double, double* %arrayidx0
+  %ld1 = load double, double* %arrayidx1
+  %ld2 = load double, double* %arrayidx2
+  %ld3 = load double, double* %arrayidx3
+
+  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
+  %ins1 = insertelement <4 x double> %ins0, double %ld1, i32 1
+  %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2
+  %ins3 = insertelement <4 x double> %ins2, double %ld3, i32 3
+
+  %shuffle = shufflevector <4 x double> %ins3, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+; SLP-LABEL: load_four_scalars_but_use_two1
+; SLP:  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+; SLP-NEXT:  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+; SLP-NEXT:  %1 = getelementptr double, double* %ptr, i64 0
+; SLP-NEXT:  %2 = getelementptr double, double* %ptr, i64 3
+; SLP-NEXT:  %3 = bitcast double* %1 to <4 x double>*
+; SLP-NEXT:  %4 = load <4 x double>, <4 x double>* %3, align 8
+; SLP-NEXT:  %5 = extractelement <4 x double> %4, i32 0
+; SLP-NEXT:  %6 = insertelement <4 x double> undef, double %5, i64 0
+; SLP-NEXT:  %7 = extractelement <4 x double> %4, i32 1
+; SLP-NEXT:  %ins1 = insertelement <4 x double> %6, double %7, i32 1
+; SLP-NEXT:  %8 = extractelement <4 x double> %4, i32 2
+; SLP-NEXT:  %ins2 = insertelement <4 x double> %ins1, double %8, i32 2
+; SLP-NEXT:  %9 = extractelement <4 x double> %4, i32 3
+; SLP-NEXT:  %10 = insertelement <4 x double> %ins2, double %9, i64 3
+; SLP-NEXT:  %shuffle = shufflevector <4 x double> %10, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; SLP-NEXT:  ret <4 x double> %shuffle
+
+; ICOMB-LABEL: load_four_scalars_but_use_two1
+; ICOMB:  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+; ICOMB-NEXT:  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+; ICOMB-NEXT:  %ld1 = load double, double* %arrayidx1, align 8, !dereferenceable_or_null !2
+; ICOMB-NEXT:  %ld2 = load double, double* %arrayidx2, align 8, !dereferenceable_or_null !1
+; ICOMB-NEXT:  %ins1 = insertelement <4 x double> undef, double %ld1, i32 1
+; ICOMB-NEXT:  %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2
+; ICOMB-NEXT:  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; ICOMB-NEXT:  ret <4 x double> %shuffle
+
+define <4 x double> @load_four_scalars_but_use_two1(double* %ptr) {
+  %arrayidx0 = getelementptr inbounds double, double* %ptr, i64 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+  
+  %ld0 = load double, double* %arrayidx0
+  %ld1 = load double, double* %arrayidx1
+  %ld2 = load double, double* %arrayidx2
+  %ld3 = load double, double* %arrayidx3
+
+  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
+  %ins1 = insertelement <4 x double> %ins0, double %ld1, i32 1
+  %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2
+  %ins3 = insertelement <4 x double> %ins2, double %ld3, i32 3
+
+  %shuffle = shufflevector <4 x double> %ins3, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
Index: test/Verifier/dereferenceable-md.ll
===================================================================
--- test/Verifier/dereferenceable-md.ll
+++ test/Verifier/dereferenceable-md.ll
@@ -18,22 +18,6 @@
 ; CHECK: dereferenceable, dereferenceable_or_null apply only to load instructions, use attributes for calls or invokes
 ; CHECK-NEXT: call i8* @foo()
 
-define i8 @f3(i8* %x) {
-entry:
-  %y = load i8, i8* %x, !dereferenceable !{i64 2}
-  ret i8 %y
-}
-; CHECK: dereferenceable, dereferenceable_or_null apply only to pointer types
-; CHECK-NEXT: load i8, i8* %x
-
-define i8 @f4(i8* %x) {
-entry:
-  %y = load i8, i8* %x, !dereferenceable_or_null !{i64 2}
-  ret i8 %y
-}
-; CHECK: dereferenceable, dereferenceable_or_null apply only to pointer types
-; CHECK-NEXT: load i8, i8* %x
-
 define i8* @f5(i8** %x) {
 entry:
   %y = load i8*, i8** %x, !dereferenceable !{}
@@ -83,4 +67,4 @@
   ret i8* %y
 }
 ; CHECK: dereferenceable, dereferenceable_or_null metadata value must be an i64!
-; CHECK-NEXT: load i8*, i8** %x
\ No newline at end of file
+; CHECK-NEXT: load i8*, i8** %x