Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -488,6 +488,10 @@
   /// any callee-saved registers, so would require a spill and fill.
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;

+  /// \returns True if the target supports using a memory operand as the
+  /// destination for the given opcode and type.
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) const;
+
   /// \returns True if the intrinsic is a supported memory intrinsic. Info
   /// will contain additional information - whether the intrinsic may write
   /// or read to memory, volatility and the pointer. Info is undefined
@@ -591,6 +595,7 @@
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
   virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
+  virtual bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) = 0;
   virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) = 0;
   virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
@@ -761,6 +766,9 @@
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
     return Impl.getCostOfKeepingLiveOverCall(Tys);
   }
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) override {
+    return Impl.isLegalMemDestOperand(Opcode, Ty);
+  }
   bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                           MemIntrinsicInfo &Info) override {
     return Impl.getTgtMemIntrinsic(Inst, Info);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -317,6 +317,8 @@

   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }

+  bool isLegalMemDestOperand(unsigned, Type *) { return false; }
+
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
     return false;
   }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -265,6 +265,11 @@
   return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
 }

+bool TargetTransformInfo::isLegalMemDestOperand(unsigned Opcode,
+                                                Type *Ty) const {
+  return TTIImpl->isLegalMemDestOperand(Opcode, Ty);
+}
+
 bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                              MemIntrinsicInfo &Info) const {
   return TTIImpl->getTgtMemIntrinsic(Inst, Info);
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -101,9 +101,9 @@
                          Type *Ty);
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty);
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty);
   bool isLegalMaskedLoad(Type *DataType, int Consecutive);
   bool isLegalMaskedStore(Type *DataType, int Consecutive);
-
   /// @}
 };
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -66,6 +66,26 @@
 }

+bool X86TTIImpl::isLegalMemDestOperand(unsigned Opcode, Type *Ty) {
+  if (Ty->isVectorTy())
+    return false;
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return true;
+  }
+  return false;
+}
+
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
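(Reviewer sketch, not part of the patch: the contract the X86 override above establishes, written as a standalone helper. It assumes a TargetTransformInfo already constructed for an x86-64 target, obtained however the hosting pass normally gets one; the helper name checkMemDestContract is made up for illustration.)

// Hypothetical check of the new hook's behavior on x86-64: scalar integer
// ALU/shift opcodes can fold a memory destination, vector types never can.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

static void checkMemDestContract(const TargetTransformInfo &TTI,
                                 LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  VectorType *V2I64 = VectorType::get(I64, 2);
  // A scalar shift can fold its memory destination (e.g. shrq $5, (%rdx)).
  assert(TTI.isLegalMemDestOperand(Instruction::LShr, I64));
  // The vector form cannot, so the hook must reject any vector type.
  assert(!TTI.isLegalMemDestOperand(Instruction::LShr, V2I64));
  // FP opcodes are not in the X86 switch, so they are rejected too.
  assert(!TTI.isLegalMemDestOperand(Instruction::FAdd, I64));
}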
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -359,6 +359,10 @@
   /// holding live values over call sites.
   int getSpillCost();

+  /// \returns the cost incurred by other side effects, such as failing
+  /// to combine instructions after vectorization.
+  int getOtherCost();
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
@@ -1710,6 +1714,41 @@
   return Cost;
 }

+int BoUpSLP::getOtherCost() {
+  // Many X86 scalar instructions support using a memory operand as the
+  // destination, but most vector instructions do not. For example:
+  //   shrq $5, (%rdx)
+  //   shrq $5, 8(%rdx)
+  // is often better than:
+  //   movdqu (%rdx), %xmm0
+  //   psrlq $5, %xmm0
+  //   movdqu %xmm0, (%rdx)
+  if (VectorizableTree.size() >= 3) {
+    StoreInst *SI = dyn_cast<StoreInst>(VectorizableTree[0].Scalars[0]);
+    if (!SI)
+      return 0;
+
+    LoadInst *LI = dyn_cast<LoadInst>(VectorizableTree[2].Scalars[0]);
+    if (!LI)
+      return 0;
+
+    Instruction *IT = cast<Instruction>(VectorizableTree[1].Scalars[0]);
+    ArrayRef<Value *> VL = VectorizableTree[0].Scalars;
+    VectorType *VecTy = VectorType::get(IT->getType(), VL.size());
+
+    // If the scalar version of IT cannot use a memory operand as its
+    // destination, or the vector version can, there is no extra cost.
+    // Likewise if LI and SI use different memory addresses.
+    if (!TTI->isLegalMemDestOperand(IT->getOpcode(), IT->getType()) ||
+        TTI->isLegalMemDestOperand(IT->getOpcode(), VecTy) ||
+        SI->getOperand(1) != LI->getOperand(0))
+      return 0;
+
+    return VL.size() * 2;
+  }
+  return 0;
+}
+
 int BoUpSLP::getTreeCost() {
   int Cost = 0;
   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
@@ -1753,6 +1792,8 @@

   Cost += getSpillCost();

+  Cost += getOtherCost();
+
   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
   return Cost + ExtractCost;
 }
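(Reviewer note: a hypothetical C++ reduction of the pattern getOtherCost() penalizes; it is illustrative only, not taken from the PR23510 report.)

// In-place 64-bit right shifts. x86 can fold each statement into a
// single shrq with a memory destination, roughly:
//   shrq $5, 8(%rdi)
//   shrq $5, 16(%rdi)
// whereas a 2-wide SLP vectorization of the pair needs the
// movdqu/psrlq/movdqu sequence shown in the comment above.
void shiftInPlace(unsigned long long *p) {
  p[1] >>= 5;
  p[2] >>= 5;
}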
Index: test/Transforms/SLPVectorizer/X86/pr23510.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/pr23510.ll
+++ test/Transforms/SLPVectorizer/X86/pr23510.ll
@@ -0,0 +1,36 @@
+; PR23510
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -basicaa -slp-vectorizer -S | FileCheck %s
+; Check that SLP does not generate a vectorized lshr.
+; CHECK-LABEL: @foo(
+; CHECK-NOT: lshr <2 x i64>
+
+define void @foo(float* nocapture readonly %p1, i32 %p2, i64* nocapture %p3, float* nocapture %p4) {
+entry:
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %tmp = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %tmp1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %tmp, %tmp1
+  store float %add, float* %arrayidx2, align 4
+  store i64 0, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %tmp2 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %tmp2, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %tmp3 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %tmp3, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %tmp4 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %tmp4, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %tmp5 = load float, float* %add.ptr11, align 4
+  %tmp6 = load float, float* %p4, align 4
+  %add15 = fadd float %tmp5, %tmp6
+  store float %add15, float* %p4, align 4
+  ret void
+}
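(Reviewer note: my trace of how the test above exercises the new cost hook, assuming SLP bundles the stores at VectorizableTree[0], the lshr instructions at node 1, and the loads at node 2, as getOtherCost() expects; the numbers are a back-of-the-envelope reading of the patch, not tool output.)

// Hypothetical trace of getOtherCost() on the test's 2-wide lshr bundle:
//   isLegalMemDestOperand(LShr, i64)       -> true  (shrq folds the store)
//   isLegalMemDestOperand(LShr, <2 x i64>) -> false (psrlq cannot)
//   SI->getOperand(1) == LI->getOperand(0) (load and store share %arrayidx4)
constexpr int VF = 2;            // lanes in <2 x i64>
constexpr int Penalty = VF * 2;  // value getOtherCost() returns: 4
static_assert(Penalty == 4, "penalty offsets the vector shift's saving");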