Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -322,6 +322,7 @@
   enum ShuffleKind {
     SK_Broadcast,       ///< Broadcast element 0 to all other elements.
     SK_Reverse,         ///< Reverse the order of the vector.
+    SK_Alternate,       ///< Choose alternate elements from vector.
     SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
     SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
   };
Index: lib/CodeGen/BasicTargetTransformInfo.cpp
===================================================================
--- lib/CodeGen/BasicTargetTransformInfo.cpp
+++ lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -39,6 +39,9 @@
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  /// Estimate the cost overhead of SK_Alternate shuffle.
+  unsigned getAltShuffleOverhead(Type *Ty) const;
+
   const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
 
 public:
@@ -327,8 +330,28 @@
   return OpCost;
 }
 
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Can only shuffle vectors");
+  unsigned Cost = 0;
+  // The shuffle cost is equal to the cost of extracting each element from its
+  // source vector plus the cost of inserting it into the result vector.
+
+  // e.g. a <4 x float> shuffle has a mask of <0,5,2,7>, i.e. we need to
+  // extract index 0 of the first vector, index 1 of the second vector, index
+  // 2 of the first vector and finally index 3 of the second vector, and
+  // insert them at indices <0,1,2,3> of the result vector.
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+  return Cost;
+}
+
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  if (Kind == SK_Alternate) {
+    return getAltShuffleOverhead(Tp);
+  }
   return 1;
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -443,31 +443,58 @@
 unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only handle costs of reverse shuffles for now.
-  if (Kind != SK_Reverse)
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
-    // Reverse shuffle cost one instruction if we are shuffling within a double
-    // word (vrev) or two if we shuffle a quad word (vrev, vext).
-    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
-    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
-  };
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-
-  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
-  if (Idx == -1)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
-
-  return LT.first * NEONShuffleTbl[Idx].Cost;
+  if (Kind == SK_Reverse) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+        // Reverse shuffle costs one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+    return LT.first * NEONShuffleTbl[Idx].Cost;
+  }
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    int Idx =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * NEONAltShuffleTbl[Idx].Cost;
+  }
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -402,17 +402,47 @@
 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only estimate the cost of reverse shuffles.
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
 
-  // Multiple by the number of parts.
-  return Cost * LT.first;
+    // Multiply by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> X86AltShuffleTbl[] = {
+        // Alt shuffle cost table for X86. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 8},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 49}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(X86AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * X86AltShuffleTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,48 @@
   return true;
 }
 
+/// \returns the opcode that can be paired with \p Op to form an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+/// \returns true if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+      Op == Instruction::Sub || Op == Instruction::Add)
+    return true;
+  return false;
+}
+
+/// \returns ShuffleVector instruction if the instructions in \p VL form an
+/// alternating fadd,fsub / fsub,fadd / add,sub / sub,add sequence
+/// (e.g. opcodes fadd,fsub,fadd,fsub...), and zero otherwise.
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +422,7 @@
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
@@ -594,6 +640,7 @@
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@
     newTreeEntry(VL, false);
     return;
   }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check whether this shuffle-vector opcode refers to an alternate sequence
+  // of opcodes (the scalar instructions themselves are not shufflevectors).
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
   }
@@ -754,8 +810,6 @@
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -1057,6 +1111,26 @@
     }
     return;
   }
+  case Instruction::ShuffleVector: {
+    // If this is not an alternate sequence of opcodes like add-sub
+    // then do not vectorize this instruction.
+    if (!isAltShuffle) {
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      return;
+    }
+    newTreeEntry(VL, true);
+    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      buildTree_rec(Operands, Depth + 1);
+    }
+    return;
+  }
   default:
     newTreeEntry(VL, false);
     DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1080,11 +1154,9 @@
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
   case Instruction::PHI: {
     return 0;
   }
@@ -1242,6 +1314,32 @@
     return VecCallCost - ScalarCallCost;
   }
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Instruction *I = dyn_cast<Instruction>(VL[i]);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to the sum of the cost of creating the two vectors
+    // and the cost of creating the shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost +=
+        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
   default:
     llvm_unreachable("Unknown instruction");
   }
 }
@@ -1522,9 +1620,7 @@
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
   case Instruction::PHI: {
@@ -1797,6 +1893,49 @@
     E->VectorizedValue = V;
     return V;
   }
+  case Instruction::ShuffleVector: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
+    setInsertPointAfterBundle(E->Scalars);
+
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
+
+    if (Value *V = alreadyVectorized(E->Scalars))
+      return V;
+
+    // Create a vector of LHS op1 RHS.
+    BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+    Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+    // Create a vector of LHS op2 RHS.
+    Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+    BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+    Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+    // Create an appropriate shuffle to pick the alternate operation results
+    // from the two vectors.
+    std::vector<Constant *> Mask(E->Scalars.size());
+    unsigned e = E->Scalars.size();
+    for (unsigned i = 0; i < e; ++i) {
+      if (i & 1)
+        Mask[i] = Builder.getInt32(e + i);
+      else
+        Mask[i] = Builder.getInt32(i);
+    }
+
+    Value *ShuffleMask = ConstantVector::get(Mask);
+
+    Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+    E->VectorizedValue = V;
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      return propagateMetadata(I, E->Scalars);
+
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
@@ -1865,7 +2004,6 @@
   // For each lane:
   for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
     Value *Scalar = Entry->Scalars[Lane];
-
     // No need to handle users of gathered values.
     if (Entry->NeedToGather)
       continue;
@@ -2049,7 +2187,6 @@
   for (po_iterator<BasicBlock *> it = po_begin(&F.getEntryBlock()),
        e = po_end(&F.getEntryBlock()); it != e; ++it) {
     BasicBlock *BB = *it;
-
     // Vectorize trees that end at stores.
     if (unsigned count = collectStores(BB, R)) {
       (void)count;
Index: test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/addsub.ll
+++ test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,193 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: %4 = add <4 x i32> %1, %0
+; CHECK: %5 = add <4 x i32> %4, %2
+; CHECK: %6 = sub <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32>
+; CHECK: %8 = add <4 x i32> %7, %3
+; CHECK: %9 = sub <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1 = add i32 %1, %0
+  %add = add i32 %add1, %2
+  %add2 = add i32 %add, %3
+  store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add4.neg = add i32 %5, %4
+  %add3 = sub i32 %add4.neg, %6
+  %sub = sub i32 %add3, %7
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6 = add i32 %9, %8
+  %add5 = add i32 %add6, %10
+  %add7 = add i32 %add5, %11
+  store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9.neg = add i32 %13, %12
+  %add8 = sub i32 %add9.neg, %14
+  %sub10 = sub i32 %add8, %15
+  store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+; CHECK-LABEL: @subadd
+; CHECK: %4 = add <4 x i32>
+; CHECK: %5 = sub <4 x i32> %4, %2
+; CHECK: %6 = add <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32>
+; CHECK: %8 = sub <4 x i32> %7, %3
+; CHECK: %9 = add <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1.neg = add i32 %1, %0
+  %add = sub i32 %add1.neg, %2
+  %sub = sub i32 %add, %3
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add3 = add i32 %5, %4
+  %add2 = add i32 %add3, %6
+  %add4 = add i32 %add2, %7
+  store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6.neg = add i32 %9, %8
+  %add5 = sub i32 %add6.neg, %10
+  %sub7 = sub i32 %add5, %11
+  store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9 = add i32 %13, %12
+  %add8 = add i32 %add9, %14
+  %add10 = add i32 %add8, %15
+  store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+; CHECK-LABEL: @faddfsub
+; CHECK: %2 = fadd <4 x float> %0, %1
+; CHECK: %3 = fsub <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32>
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %sub = fsub float %2, %3
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add1 = fadd float %4, %5
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub2 = fsub float %6, %7
+  store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @fsubfadd
+; CHECK: %2 = fsub <4 x float> %0, %1
+; CHECK: %3 = fadd <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32>
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %sub = fsub float %0, %1
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add = fadd float %2, %3
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %sub1 = fsub float %4, %5
+  store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %add2 = fadd float %6, %7
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind uwtable
+define void @No_faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add1 = fadd float %2, %3
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add2 = fadd float %4, %5
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub = fsub float %6, %7
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"float", metadata !3, i64 0}
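
Note (illustrative only, not part of the patch): the scalar pattern this change targets looks roughly like the C sketch below. The array and function names are hypothetical and simply mirror the globals and the @addsub function in addsub.ll; a front end may of course emit somewhat different IR. Even-indexed elements are combined with add and odd-indexed elements with sub over otherwise identical operand trees, which is the alternating opcode sequence that getSameOpcode()/isAltInst() now recognize and that is vectorized as two vector binary operations merged by a shufflevector (costed via SK_Alternate).

/* Hypothetical C source mirroring the @addsub test: even lanes add,
   odd lanes subtract, over otherwise identical operand trees. */
int a[4], b[4], c[4], d[4], e[4];

void addsub(void) {
  a[0] = (b[0] + c[0]) + d[0] + e[0];
  a[1] = (b[1] + c[1]) - d[1] - e[1];
  a[2] = (b[2] + c[2]) + d[2] + e[2];
  a[3] = (b[3] + c[3]) - d[3] - e[3];
}

After SLP vectorization, the even and odd results come from one vector add and one vector sub whose results are combined by a shufflevector selecting lanes <0,5,2,7>, which is what the CHECK lines in the test verify.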