Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,20 @@
   return true;
 }
 
+static unsigned checkAddSubInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  if (Opcode != Instruction::FAdd)
+    return 0;
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I ||
+        I->getOpcode() != ((i & 1) ? Instruction::FSub : Instruction::FAdd))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +172,11 @@
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (Opcode == Instruction::FAdd && i == 1)
+        return checkAddSubInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +394,7 @@
 
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
@@ -471,7 +489,9 @@
     if (Vectorized) {
       Last->LastScalarIndex = getLastIndex(VL);
       for (int i = 0, e = VL.size(); i != e; ++i) {
-        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+        assert(!(getSameOpcode(VL) != Instruction::ShuffleVector &&
+                 ScalarToTreeEntry.count(VL[i])) &&
+               "Scalar already in tree!");
         ScalarToTreeEntry[VL[i]] = idx;
       }
     } else {
@@ -594,6 +614,7 @@
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +636,19 @@
       newTreeEntry(VL, false);
       return;
     }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check that this shuffle vector refers to the alternate
+  // sequence of opcodes.
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
   }
@@ -732,7 +762,7 @@
   // Check that every instructions appears once in this bundle.
   for (unsigned i = 0, e = VL.size(); i < e; ++i)
     for (unsigned j = i+1; j < e; ++j)
-      if (VL[i] == VL[j]) {
+      if ((VL[i] == VL[j]) && (Opcode != Instruction::ShuffleVector)) {
         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
         newTreeEntry(VL, false);
         return;
@@ -754,8 +784,6 @@
 
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -987,6 +1015,26 @@
       }
       return;
     }
+    case Instruction::ShuffleVector: {
+      // If this is not an alternate sequence of opcode like add-sub
+      // then do not vectorize this instruction.
+      if (!isAltShuffle) {
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+        return;
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     default:
       newTreeEntry(VL, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1010,11 +1058,9 @@
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
   case Instruction::PHI: {
     return 0;
   }
@@ -1158,6 +1204,33 @@
 
     return VecCallCost - ScalarCallCost;
   }
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Instruction *I = cast<Instruction>(VL[i]);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+
+    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+    VecCost += TTI->getCastInstrCost(Opcode, VecTy, MaskTy);
+    return VecCost - ScalarCost;
+  }
   default:
     llvm_unreachable("Unknown instruction");
   }
@@ -1438,9 +1511,7 @@
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
   case Instruction::PHI: {
@@ -1673,6 +1744,49 @@
     E->VectorizedValue = V;
     return V;
   }
+  case Instruction::ShuffleVector: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
+    setInsertPointAfterBundle(E->Scalars);
+
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
+
+    if (Value *V = alreadyVectorized(E->Scalars))
+      return V;
+
+    // Create a vector of LHS op1 RHS
+    BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+    Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+    // Create a vector of LHS op2 RHS
+    Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+    BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+    Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+    // Create appropriate shuffle to take alternative operations from
+    // the vector.
+    std::vector<Constant *> Mask(E->Scalars.size());
+    unsigned e = E->Scalars.size();
+    for (unsigned i = 0; i < e; ++i) {
+      if (i & 1)
+        Mask[i] = Builder.getInt32(e + i);
+      else
+        Mask[i] = Builder.getInt32(i);
+    }
+
+    Value *ShuffleMask = ConstantVector::get(Mask);
+
+    Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+    E->VectorizedValue = V;
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      return propagateMetadata(I, E->Scalars);
+
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
@@ -1737,11 +1851,15 @@
   // For each vectorized value:
   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
     TreeEntry *Entry = &VectorizableTree[EIdx];
+    SmallDenseMap<Value *, int> ScalarEntry;
 
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-
+      // If this Scalar was already handled then continue
+      if (ScalarEntry.count(Scalar))
+        continue;
+      ScalarEntry[Scalar] = Lane;
       // No need to handle users of gathered values.
       if (Entry->NeedToGather)
        continue;
@@ -1925,7 +2043,6 @@
   for (po_iterator<BasicBlock *> it = po_begin(&F.getEntryBlock()),
        e = po_end(&F.getEntryBlock()); it != e; ++it) {
     BasicBlock *BB = *it;
-
     // Vectorize trees that end at stores.
     if (unsigned count = collectStores(BB, R)) {
       (void)count;
Index: test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/addsub.ll
+++ test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common global [4 x float] zeroinitializer, align 16
+@b = common global [4 x float] zeroinitializer, align 16
+@c = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: fadd <4 x float>
+; CHECK: fsub <4 x float>
+; CHECK: shufflevector
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+  %1 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %3 = fadd float %1, %2
+  store float %3, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %6 = fsub float %4, %5
+  store float %6, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %8 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = fadd float %7, %8
+  store float %9, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 3), align 4, !tbaa !1
+  %11 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %12 = fsub float %10, %11
+  store float %12, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (209794)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"float", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
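
For reference, the IR below is a rough hand-written sketch, not produced by this patch and not part of the test, of the shape of output the new ShuffleVector path is expected to emit for @addsub above; the value names, pointer casts, and alignments are illustrative only. Even lanes are taken from the fadd vector and odd lanes from the fsub vector, matching the mask built in vectorizeTree (Mask[i] = i for even i, e + i for odd i):

  ; Illustrative only -- approximate vectorized form of @addsub.
  %va = load <4 x float>* bitcast ([4 x float]* @a to <4 x float>*), align 16
  %vb = load <4 x float>* bitcast ([4 x float]* @b to <4 x float>*), align 16
  %vadd = fadd <4 x float> %va, %vb          ; lanes 0..3: a[i] + b[i]
  %vsub = fsub <4 x float> %va, %vb          ; lanes 0..3: a[i] - b[i]
  ; mask <0, 5, 2, 7> selects add lanes 0 and 2, sub lanes 1 and 3
  %vr = shufflevector <4 x float> %vadd, <4 x float> %vsub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  store <4 x float> %vr, <4 x float>* bitcast ([4 x float]* @c to <4 x float>*), align 16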