Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -47,9 +47,9 @@
   struct BinOpChain;
   struct Reduction;
 
-  using OpChainList     = SmallVector<OpChain *, 8>;
+  using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
-  using ValueList       = SmallVector<Value *, 8>;
+  using ValueList       = SmallSetVector<Value *, 8>;
   using MemInstList     = SmallVector<Instruction *, 8>;
   using PMACPair        = std::pair<BinOpChain *, BinOpChain *>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
@@ -59,25 +59,45 @@
   struct OpChain {
     Instruction *Root;
     ValueList AllValues;
-    MemInstList VecLd;          // List of all load instructions.
-    MemLocList MemLocs;         // All memory locations read by this tree.
+    MemInstList VecLd;          // List of all load instructions.
+    MemLocList MemReadLocs;     // All memory locations read by this tree.
+    MemLocList MemWriteLocs;    // All memory locations written.
     bool ReadOnly = true;
 
+    OpChain(Instruction *I) : Root(I) { }
     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
 
-    void SetMemoryLocations() {
+    void Finalise() {
+      AllValues.insert(Root);
       const auto Size = MemoryLocation::UnknownSize;
       for (auto *V : AllValues) {
         if (auto *I = dyn_cast<Instruction>(V)) {
-          if (I->mayWriteToMemory())
+          if (I->mayWriteToMemory()) {
+            assert(isa<StoreInst>(I) && "Expect only stores to write memory");
+            MemWriteLocs.push_back(MemoryLocation(
+              cast<StoreInst>(I)->getPointerOperand(), Size));
             ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
-            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+          }
+          if (auto *Ld = dyn_cast<LoadInst>(V)) {
+            MemReadLocs.push_back(
+              MemoryLocation(Ld->getPointerOperand(), Size));
+          }
         }
       }
     }
 
+    MemLocList &Reads() {
+      return MemReadLocs;
+    }
+
+    MemLocList &Writes() {
+      return MemWriteLocs;
+    }
+
     unsigned size() const { return AllValues.size(); }
+
+    bool contains(Value *V) const { return AllValues.count(V) != 0; }
   };
 
   // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
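A minimal sketch (not part of the patch) of why ValueList becomes a SmallSetVector in the hunk above: the chain walk can reach the same extend or load through both operands of a mul, and Finalise()/Parallelise() want each value recorded exactly once, in insertion order. This assumes the usual llvm/ADT/SetVector.h interface; the helper name below is hypothetical.

  #include "llvm/ADT/SetVector.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Record a value for widening at most once: insert() is a no-op returning
  // false when V is already tracked, yet iteration still follows insertion
  // order, which the widening loop relies on when it walks AllValues.
  static bool recordForWidening(SmallSetVector<Value *, 8> &VL, Value *V) {
    return VL.insert(V);
  }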
@@ -91,33 +111,119 @@
     BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
       OpChain(I, lhs), LHS(lhs), RHS(rhs) {
       for (auto *V : RHS)
-        AllValues.push_back(V);
+        AllValues.insert(V);
+    }
+  };
+
+  class ParallelChains {
+  protected:
+    SmallVector<Instruction *, 8> Roots;
+    std::map<Instruction *, unsigned> ChainMap;
+    OpChainList Chains;
+
+  public:
+    ParallelChains() { }
+
+    void addRoot(Instruction *I, OpChain *Chain) {
+      assert(!ChainMap.count(I) &&
+             "Root already added to parallel sequence");
+      ChainMap[I] = Roots.size();
+      Roots.push_back(I);
+      Chain->Finalise();
+      Chains.push_back(std::unique_ptr<OpChain>{Chain});
+    }
+
+    OpChain *getSequence(unsigned i) {
+      assert(ChainMap.count(Roots[i]) &&
+             "trying to get unknown sequence");
+      return Chains[i].get();
+    }
+
+    OpChainList &getAllCandidates() {
+      return Chains;
+    }
+
+    bool contains(Value *V) {
+      for (unsigned i = 0; i < Chains.size(); ++i) {
+        OpChain *C = Chains[i].get();
+        if (C->contains(V))
+          return true;
       }
+      return false;
+    }
   };
 
-  struct Reduction {
+  class SuperWord : public ParallelChains {
+  public:
+
+    SuperWord(Instruction *I, OpChain *Chain) {
+      ChainMap[I] = Roots.size();
+      Roots.push_back(I);
+      Chains.push_back(std::unique_ptr<OpChain>{Chain});
+      Chain->Finalise();
+    }
+
+    ~SuperWord() {
+      for (auto *I : Roots) {
+        ValueList &AllValues = Chains[ChainMap[I]].get()->AllValues;
+        AllValues.remove(I);
+        I->dropAllReferences();
+        I->removeFromParent();
+
+        for (auto VI = AllValues.rbegin(), E = AllValues.rend(); VI != E;
+             ++VI) {
+          Value *V = *VI;
+          if (V->hasNUses(0)) {
+            if (auto *I = dyn_cast<Instruction>(V)) {
+              I->dropAllReferences();
+              I->removeFromParent();
+            }
+          }
+        }
+      }
+    }
+
+    unsigned getNumLanes() const { return Roots.size(); }
+
+    unsigned getElementSize() const {
+      assert(!Roots.empty() && "Invalid access of Roots");
+
+      Type *Ty = Roots.front()->getType();
+      if (auto *Store = dyn_cast<StoreInst>(Roots.front()))
+        Ty = Store->getValueOperand()->getType();
+      return Ty->getPrimitiveSizeInBits();
+    }
+
+    unsigned getVectorLength() const {
+      unsigned Length = 32 / getElementSize();
+      return Length > Roots.size() ? Roots.size() : Length;
+    }
+  };
+
+  struct Reduction : public ParallelChains {
     PHINode *Phi;             // The Phi-node from where we start
                               // pattern matching.
     Instruction *AccIntAdd;   // The accumulating integer add statement,
                               // i.e, the reduction statement.
-    OpChainList MACCandidates;    // The MAC candidates associated with
-                                  // this reduction statement.
-    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
+    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }
+
+    bool isValidMemoryAccess();
   };
 
   class ARMParallelDSP : public LoopPass {
-    ScalarEvolution   *SE;
-    AliasAnalysis     *AA;
-    TargetLibraryInfo *TLI;
-    DominatorTree     *DT;
-    LoopInfo          *LI;
-    Loop              *L;
-    const DataLayout  *DL;
-    Module            *M;
+    ScalarEvolution   *SE = nullptr;
+    AliasAnalysis     *AA = nullptr;
+    TargetLibraryInfo *TLI = nullptr;
+    DominatorTree     *DT = nullptr;
+    Loop              *L = nullptr;
+    const DataLayout  *DL = nullptr;
+    Module            *M = nullptr;
 
     bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+    bool AreSequentialStores(StoreInst *St0, StoreInst *St1,
+                             MemInstList &VecMem);
     PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, Instruction *InsertAfter);
@@ -128,6 +234,8 @@
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
     bool MatchSMLAD(Function &F);
+    void Parallelise(OpChain *ParallelInsts, unsigned BitWidth);
+    bool FindParallelChains();
 
   public:
     static char ID;
@@ -153,7 +261,6 @@
       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
       auto &TPC = getAnalysis<TargetPassConfig>();
 
       BasicBlock *Header = TheLoop->getHeader();
@@ -188,11 +295,12 @@
         return false;
       }
 
-      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
       bool Changes = false;
 
-      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
+      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
+      LLVM_DEBUG(dbgs() << "Function: " << F.getName() << ", " << *L << "\n");
       Changes = MatchSMLAD(F);
+      Changes |= FindParallelChains();
       return Changes;
     }
   };
@@ -206,7 +314,13 @@
 template<unsigned MaxBitWidth>
 static bool IsNarrowSequence(Value *V, ValueList &VL) {
   LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
-  ConstantInt *CInt;
+  ConstantInt *CInt = nullptr;
+
+  auto IsNarrowType = [&](Type *Ty) {
+    if (auto *PtrTy = dyn_cast<PointerType>(Ty))
+      Ty = PtrTy->getElementType();
+    return Ty->getPrimitiveSizeInBits() == MaxBitWidth;
+  };
 
   if (match(V, m_ConstantInt(CInt))) {
     // TODO: if a constant is used, it needs to fit within the bit width.
@@ -218,25 +332,28 @@
     return false;
 
   Value *Val, *LHS, *RHS;
-  if (match(V, m_Trunc(m_Value(Val)))) {
-    if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
-      return IsNarrowSequence<MaxBitWidth>(Val, VL);
-  } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
-    // TODO: we need to implement sadd16/sadd8 for this, which enables to
-    // also do the rewrite for smlad8.ll, but it is unsupported for now.
-    LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
-    return false;
-  } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
-    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
-      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
-        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
-      return false;
+  if (match(I, m_Trunc(m_Value(Val)))) {
+    if (IsNarrowType(cast<TruncInst>(I)->getDestTy()))
+      return IsNarrowSequence<MaxBitWidth>(Val, VL);
+  } else if (match(I, m_Add(m_Value(LHS), m_Value(RHS)))) {
+    LLVM_DEBUG(dbgs() << "Found add in sequence: " << *V << "\n");
+    if (IsNarrowSequence<MaxBitWidth>(LHS, VL) &&
+        IsNarrowSequence<MaxBitWidth>(RHS, VL)) {
+      VL.insert(I);
+      return true;
     }
-
-    if (match(Val, m_Load(m_Value()))) {
-      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
-      VL.push_back(Val);
-      VL.push_back(I);
+  } else if (match(I, m_ZExtOrSExt(m_Value(Val)))) {
+    if (IsNarrowType(cast<CastInst>(I)->getSrcTy()) &&
+        IsNarrowSequence<MaxBitWidth>(Val, VL)) {
+      VL.insert(I);
+      return true;
+    } else
+      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
+        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+  } else if (auto *Ld = dyn_cast<LoadInst>(I)) {
+    if (Ld->isSimple() && IsNarrowType(Ld->getPointerOperandType())) {
+      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; V->dump());
+      VL.insert(I);
       return true;
     }
   }
@@ -320,6 +437,19 @@
   return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
 }
 
+bool ARMParallelDSP::AreSequentialStores(StoreInst *St0, StoreInst *St1,
+                                         MemInstList &VecMem) {
+  if (!St0 || !St1)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Are consecutive stores:\n";
+             dbgs() << "St0:"; St0->dump();
+             dbgs() << "St1:"; St1->dump();
+            );
+
+  return AreSequentialAccesses<StoreInst>(St0, St1, VecMem, *DL, *SE);
+}
+
 PMACPairList
 ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
   const unsigned Elems = Candidates.size();
@@ -332,8 +462,8 @@
   // We can compare all elements, but then we need to compare and evaluate
   // different solutions.
   for(unsigned i=0; i<Elems; i+=2) {
-    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
-    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
+    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get());
     const Instruction *Mul0 = PMul0->Root;
     const Instruction *Mul1 = PMul1->Root;
 
@@ -349,6 +479,11 @@
     const ValueList &Mul1_LHS = PMul1->LHS;
     const ValueList &Mul1_RHS = PMul1->RHS;
 
+    if (Mul0_LHS.size() != Mul1_LHS.size() ||
+        Mul0_RHS.size() != Mul1_RHS.size() ||
+        Mul0_LHS.size() != Mul1_RHS.size())
+      continue;
+
     if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
         !AreSymmetrical(Mul0_RHS, Mul1_RHS))
       continue;
@@ -445,46 +580,45 @@
   );
 }
 
-static void AddMACCandidate(OpChainList &Candidates,
-                            const Instruction *Acc,
-                            Value *MulOp0, Value *MulOp1, int MulOpNum) {
-  Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
-  ValueList LHS;
-  ValueList RHS;
-  if (IsNarrowSequence<16>(MulOp0, LHS) &&
-      IsNarrowSequence<16>(MulOp1, RHS)) {
-    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-    Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
-  }
-}
-
-static void MatchParallelMACSequences(Reduction &R,
-                                      OpChainList &Candidates) {
-  const Instruction *Acc = R.AccIntAdd;
+static void MatchParallelMACSequences(Reduction &R) {
+  Instruction *Acc = R.AccIntAdd;
   Value *A, *MulOp0, *MulOp1;
   LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
 
+  auto AddMACCandidate = [&](Instruction *Acc, Value *MulOp0, Value *MulOp1,
+                             int MulOpNum) {
+    Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
+    LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+    ValueList LHS;
+    ValueList RHS;
+    if (IsNarrowSequence<16>(MulOp0, LHS) &&
+        IsNarrowSequence<16>(MulOp1, RHS)) {
+      LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
+      R.addRoot(Acc, new BinOpChain(Mul, LHS, RHS));
+    }
+  };
+
   // Pattern 1: the accumulator is the RHS of the mul.
   while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
                          m_Value(A)))){
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 0);
     Acc = dyn_cast<Instruction>(A);
   }
   // Pattern 2: the accumulator is the LHS of the mul.
   while(match(Acc, m_Add(m_Value(A),
                          m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 1);
     Acc = dyn_cast<Instruction>(A);
   }
 
   // The last mul in the chain has a slightly different pattern:
   // the mul is the first operand
   if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 0);
 
   // Because we start at the bottom of the chain, and we work our way up,
   // the muls are added in reverse program order to the list.
+  OpChainList &Candidates = R.getAllCandidates();
   std::reverse(Candidates.begin(), Candidates.end());
 }
 
@@ -502,49 +636,69 @@
 // Check whether statements in the basic block that write to memory alias with
 // the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
+                       Instructions &Writes, ParallelChains &ParallelInsts) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto *MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-    // At the moment, we allow only simple chains that only consist of reads,
-    // accumulate their result with an integer add, and thus that don't write
-    // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
-      return true;
+  auto DoAlias = [&](OpChain *C, Instructions &Insts, MemLocList &MemLocs) {
+    for (auto *I : Insts) {
+
+      // Any writes (stores) within ParallelInsts will be sequential and of
+      // the same size, so they won't be accessing the same memory location.
+      if (I->mayWriteToMemory() && ParallelInsts.contains(I))
+        continue;
 
-    // Now for all writes in the basic block, check that they don't alias with
-    // the memory locations accessed by our MAC-chain:
-    for (auto *I : Writes) {
-      LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
+      for (auto &MemLoc : MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
-          LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
+          LLVM_DEBUG(dbgs() << "Yes, aliases found with: " << *I << "\n");
          return true;
        }
      }
    }
+    return false;
+  };
+
+  OpChainList &Candidates = ParallelInsts.getAllCandidates();
+  for (unsigned i = 0; i < Candidates.size(); ++i) {
+    OpChain *Chain = Candidates[i].get();
+    LLVM_DEBUG(dbgs() << "Root: " << *Chain->Root << "\n");
+
+    if (DoAlias(Chain, Writes, Chain->Reads()))
+      return true;
+
+    LLVM_DEBUG(dbgs() << "Chain reads are ok.\n");
+
+    if (Chain->ReadOnly)
+      continue;
+
+    if (DoAlias(Chain, Reads, Chain->Writes())) {
+      LLVM_DEBUG(dbgs() << "Chain writes interfere with reads.\n");
+      return true;
+    }
+    if (DoAlias(Chain, Writes, Chain->Writes())) {
+      LLVM_DEBUG(dbgs() << "Chain writes interfere with other writes.\n");
+      return true;
+    }
   }
   LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
   return false;
 }
 
-static bool CheckMACMemory(OpChainList &Candidates) {
-  for (auto *C : Candidates) {
+bool Reduction::isValidMemoryAccess() {
+  for (unsigned i = 0; i < Chains.size(); ++i) {
+    OpChain *C = Chains[i].get();
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
    if (C->size() < 4) {
      LLVM_DEBUG(dbgs() << "Operand list too short.\n");
      return false;
    }
-    C->SetMemoryLocations();
    ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
    ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
+    if (LHS.size() != RHS.size())
+      return false;
 
    // Use +=2 to skip over the expected extend instructions.
    for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
@@ -555,6 +709,163 @@
   return true;
 }
 
+static LoadInst *CreateWideLoad(IRBuilder<NoFolder> &IRB, const Type *WideTy,
+                                LoadInst *VecLd) {
+  const unsigned AddrSpace = VecLd->getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(VecLd->getPointerOperand(),
+                                    WideTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedLoad(VecPtr, VecLd->getAlignment());
+}
+
+static StoreInst *CreateWideStore(IRBuilder<NoFolder> &IRB, const Type *WideTy,
+                                  StoreInst *VecSt, Value *V) {
+  const unsigned AddrSpace = VecSt->getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(VecSt->getPointerOperand(),
+                                    WideTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedStore(V, VecPtr, VecSt->getAlignment());
+}
+
+void ARMParallelDSP::Parallelise(OpChain *ParallelInsts,
+                                 unsigned BitWidth) {
+
+  ValueToValueMap WideInsts;
+  Instruction *InsertAfter = ParallelInsts->Root;
+  IRBuilder<NoFolder> Builder(L->getLoopLatch(),
+                              ++BasicBlock::iterator(InsertAfter));
+
+  auto CreateParallelBinOp = [&](Instruction *I, Intrinsic::ID IntNo) {
+    Function *DSPInst = Intrinsic::getDeclaration(M, IntNo);
+    Value *Args[] = { WideInsts[I->getOperand(0)],
+                      WideInsts[I->getOperand(1)] };
+    return Builder.CreateCall(DSPInst, Args);
+  };
+
+  Type *WideTy = Type::getInt32Ty(M->getContext());
+
+  for (auto *V : ParallelInsts->AllValues) {
+    LLVM_DEBUG(dbgs() << "Widening: " << *V << "\n");
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      switch (I->getOpcode()) {
+      case Instruction::Add: {
+        Intrinsic::ID SADD = BitWidth == 8 ?
+          Intrinsic::arm_sadd8 : Intrinsic::arm_sadd16;
+        WideInsts[I] = CreateParallelBinOp(I, SADD);
+        break;
+      }
+      case Instruction::Load:
+        WideInsts[I] = CreateWideLoad(Builder, WideTy, cast<LoadInst>(I));
+        break;
+      case Instruction::Store: {
+        auto *St = cast<StoreInst>(I);
+        WideInsts[I] = CreateWideStore(Builder, WideTy, St,
+                                       WideInsts[St->getValueOperand()]);
+        break;
+      }
+      }
+    }
+  }
+}
+
+/// Search the given loop for store instructions, then search up from them to
+/// find valid narrow sequences. From those, we then build maximal sets of
+/// sequential stores.
+bool ARMParallelDSP::FindParallelChains() {
+  SmallVector<OpChain*, 8> Candidates;
+  std::map<Instruction*, OpChain*> CandidateMap;
+
+  for (auto &I : *L->getLoopLatch()) {
+    if (!isa<StoreInst>(I))
+      continue;
+
+    ValueList VL;
+    Value *V = cast<StoreInst>(I).getValueOperand();
+    if (IsNarrowSequence<16>(V, VL)) {
+      LLVM_DEBUG(dbgs() << "NarrowSequence:\n");
+      LLVM_DEBUG(for (auto *V : VL) V->dump(););
+      Candidates.push_back(new OpChain(&I, VL));
+      CandidateMap[&I] = Candidates.back();
+    }
+  }
+
+  if (Candidates.empty())
+    return false;
+
+  // Check every store against the other stores to find sequential ones.
+  // Map stores to their base store (if it exists) and also record all the
+  // subsequent stores of that base. SequentialStores will use the base store
+  // as the key to a vector of its subsequent, and sequential, accesses.
+  std::map<StoreInst*, StoreInst*> BaseStores;
+  std::map<StoreInst*, SmallVector<StoreInst*, 4>> SequentialStores;
+
+  for (auto *Cand0 : Candidates) {
+    for (auto *Cand1 : Candidates) {
+      if (Cand0->Root == Cand1->Root ||
+          !AreSymmetrical(Cand0->AllValues, Cand1->AllValues))
+        continue;
+
+      auto *St0 = cast<StoreInst>(Cand0->Root);
+      auto *St1 = cast<StoreInst>(Cand1->Root);
+      MemInstList VecMem;
+
+      if (AreSequentialStores(St0, St1, VecMem)) {
+        if (BaseStores.count(St0)) {
+          StoreInst *Base = BaseStores[St0];
+          BaseStores[St1] = Base;
+          SequentialStores[Base].push_back(St1);
+        } else {
+          BaseStores[St1] = St0;
+          SequentialStores[St0].push_back(St1);
+        }
+      }
+    }
+  }
+  if (SequentialStores.empty())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found " << SequentialStores.size()
+             << " group(s) of sequential stores:\n";
+             for (auto &I : SequentialStores) {
+               LLVM_DEBUG(dbgs() << "  " << *I.first << "\n");
+               for (auto *St : I.second)
+                 LLVM_DEBUG(dbgs() << "  " << *St << "\n");
+             });
+
+  // We've found some sequential stores, which could be:
+  //   i8 x 2,3,4, ...
+  //   i16 x 2,3,4, ...
+  // So we need to find the element width to calculate the vector width. The
+  // vector width will then determine how we step over the list of stores as
+  // we parallelise their operand chains.
+  for (auto &I : SequentialStores) {
+    StoreInst *BaseStore = I.first;
+    SmallVectorImpl<StoreInst*> &SubsequentStores = I.second;
+    SuperWord SW(BaseStore, CandidateMap[BaseStore]);
+    for (auto *St : SubsequentStores)
+      SW.addRoot(St, CandidateMap[St]);
+
+    Instructions Reads;
+    Instructions Writes;
+    AliasCandidates(L->getLoopLatch(), Reads, Writes);
+    if (AreAliased(AA, Reads, Writes, SW))
+      continue;
+
+    unsigned VecLength = SW.getVectorLength();
+    unsigned NumVecInsts = SW.getNumLanes() / SW.getVectorLength();
+    unsigned BitWidth = SW.getElementSize();
+
+    LLVM_DEBUG(dbgs() << "Parallelising " << SW.getNumLanes()
+               << " chains using " << NumVecInsts << " intrinsics.\n");
+    LLVM_DEBUG(dbgs() << "Vector length = " << SW.getVectorLength() << "\n");
+
+    for (unsigned i = 0; i < NumVecInsts; ++i)
+      Parallelise(SW.getSequence(i*VecLength), BitWidth);
+  }
+
+  return true;
+}
+
 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
// To use SMLAD: @@ -598,15 +909,12 @@ MatchReductions(F, L, Header, Reductions); for (auto &R : Reductions) { - OpChainList MACCandidates; - MatchParallelMACSequences(R, MACCandidates); - if (!CheckMACMemory(MACCandidates)) + MatchParallelMACSequences(R); + if (!R.isValidMemoryAccess()) continue; - R.MACCandidates = MACCandidates; - LLVM_DEBUG(dbgs() << "MAC candidates:\n"; - for (auto &M : R.MACCandidates) + for (auto &M : R.getAllCandidates()) M->Root->dump(); dbgs() << "\n";); } @@ -618,28 +926,16 @@ AliasCandidates(Header, Reads, Writes); for (auto &R : Reductions) { - if (AreAliased(AA, Reads, Writes, R.MACCandidates)) + if (AreAliased(AA, Reads, Writes, R)) return false; - PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates); + PMACPairList PMACPairs = CreateParallelMACPairs(R.getAllCandidates()); Changed |= InsertParallelMACs(R, PMACPairs); - for (auto *C : R.MACCandidates) - delete C; } LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); return Changed; } -static void CreateLoadIns(IRBuilder &IRB, Instruction *Acc, - LoadInst **VecLd) { - const Type *AccTy = Acc->getType(); - const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace(); - - Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(), - AccTy->getPointerTo(AddrSpace)); - *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment()); -} - Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, Instruction *Acc, Instruction *InsertAfter) { @@ -652,9 +948,9 @@ ++BasicBlock::iterator(InsertAfter)); // Replace the reduction chain with an intrinsic call - CreateLoadIns(Builder, Acc, &VecLd0); - CreateLoadIns(Builder, Acc, &VecLd1); - Value* Args[] = { VecLd0, VecLd1, Acc }; + LoadInst *Ld0 = CreateWideLoad(Builder, Acc->getType(), VecLd0); + LoadInst *Ld1 = CreateWideLoad(Builder, Acc->getType(), VecLd1); + Value* Args[] = { Ld0, Ld1, Acc }; Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad); CallInst *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; Index: test/CodeGen/ARM/sadd16-alias.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd16-alias.ll @@ -0,0 +1,130 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK-LABEL: two_seq_add_i16 +; CHECK-NOT: call i32 @llvm.arm.sadd +define void @two_seq_add_i16(i32 %N, i16* readonly %A, + i16* readonly %B, i16* nocapture %Out) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.011 = phi i32 [ %inc.1, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.011 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add i32 %i.011, 2 + %exitcond.2 = icmp eq i32 %inc.1, %N + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} + +; Acc could alias 
with Out +; CHECK-LABEL: @sadd16_acc +; CHECK-NOT: call i32 @llvm.arm.sadd +define void @sadd16_acc(i32 %N, i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out, i16* nocapture %Acc) { +entry: + %cmp17 = icmp eq i32 %N, 0 + br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %.pre = load i16, i16* %Acc, align 2 + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %.unr = phi i16 [ %.pre, %for.body.preheader ], [ %add8.3, %for.body ] + %i.018.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %2 = phi i16 [ %add8.epil, %for.body.epil ], [ %.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %i.018.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.018.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.018.epil + %3 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.018.epil + %4 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %4, %3 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.018.epil + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %add8.epil = add i16 %2, %add.epil + store i16 %add8.epil, i16* %Acc, align 2 + %inc.epil = add nuw i32 %i.018.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %5 = phi i16 [ %.pre, %for.body.preheader.new ], [ %add8.3, %for.body ] + %i.018 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 + %6 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.018 + %7 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %7, %6 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.018 + store i16 %add, i16* %arrayidx4, align 2 + %add8 = add i16 %5, %add + store i16 %add8, i16* %Acc, align 2 + %inc = or i32 %i.018, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %8 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %9 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %9, %8 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %add8.1 = add i16 %add8, %add.1 + store i16 %add8.1, i16* %Acc, align 2 + %inc.1 = or i32 %i.018, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %10 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %11 = load 
i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %11, %10 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %add8.2 = add i16 %add8.1, %add.2 + store i16 %add8.2, i16* %Acc, align 2 + %inc.2 = or i32 %i.018, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %12 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %13 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %13, %12 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %add8.3 = add i16 %add8.2, %add.3 + store i16 %add8.3, i16* %Acc, align 2 + %inc.3 = add i32 %i.018, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} Index: test/CodeGen/ARM/sadd16.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd16.ll @@ -0,0 +1,428 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK: @two_seq_add_i16 +define void @two_seq_add_i16(i32 %N, i16* readonly %A, + i16* readonly %B, i16* noalias nocapture %Out) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[RES:[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[RES]], i32* [[C0]], align 2 +for.body: + %i.011 = phi i32 [ %inc.1, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.011 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add i32 %i.011, 2 + %exitcond.2 = icmp eq i32 %inc.1, %N + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} + +; CHECK: @four_seq_add_i16 +define void @four_seq_add_i16(i32 %N, i16* noalias nocapture readonly %A, + i16* noalias nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: + %i.013.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, 
label %for.body.epil + +for.body.epil: + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.013.unr + %2 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.013.unr + %3 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %3, %2 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.013.unr + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %inc.epil = add nuw i32 %i.013.unr, 1 + %epil.iter.cmp = icmp eq i32 %xtraiter, 1 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil.1 + +for.cond.cleanup: + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 +for.body: + %i.013 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013 + %4 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.013 + %5 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %5, %4 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.013 + store i16 %add, i16* %arrayidx4, align 2 + %inc = or i32 %i.013, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %6 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %7 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %7, %6 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = or i32 %i.013, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %8 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %9 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %9, %8 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = or i32 %i.013, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %10 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %11 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %11, %10 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add i32 %i.013, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body + +for.body.epil.1: + %arrayidx.epil.1 = getelementptr inbounds i16, i16* %A, i32 %inc.epil + %12 = load i16, i16* %arrayidx.epil.1, align 2 + %arrayidx1.epil.1 = getelementptr 
inbounds i16, i16* %B, i32 %inc.epil + %13 = load i16, i16* %arrayidx1.epil.1, align 2 + %add.epil.1 = add i16 %13, %12 + %arrayidx4.epil.1 = getelementptr inbounds i16, i16* %Out, i32 %inc.epil + store i16 %add.epil.1, i16* %arrayidx4.epil.1, align 2 + %inc.epil.1 = add i32 %i.013.unr, 2 + %epil.iter.cmp.1 = icmp eq i32 %xtraiter, 2 + br i1 %epil.iter.cmp.1, label %for.cond.cleanup, label %for.body.epil.2 + +for.body.epil.2: + %arrayidx.epil.2 = getelementptr inbounds i16, i16* %A, i32 %inc.epil.1 + %14 = load i16, i16* %arrayidx.epil.2, align 2 + %arrayidx1.epil.2 = getelementptr inbounds i16, i16* %B, i32 %inc.epil.1 + %15 = load i16, i16* %arrayidx1.epil.2, align 2 + %add.epil.2 = add i16 %15, %14 + %arrayidx4.epil.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.epil.1 + store i16 %add.epil.2, i16* %arrayidx4.epil.2, align 2 + br label %for.cond.cleanup +} + +; CHECK: @sadd16_restrict_acc +define void @sadd16_restrict_acc(i32 %N, i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out, i16* noalias nocapture %Acc) { +entry: + %cmp17 = icmp eq i32 %N, 0 + br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %.pre = load i16, i16* %Acc, align 2 + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %.unr = phi i16 [ %.pre, %for.body.preheader ], [ %add8.3, %for.body ] + %i.018.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %2 = phi i16 [ %add8.epil, %for.body.epil ], [ %.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %i.018.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.018.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.018.epil + %3 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.018.epil + %4 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %4, %3 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.018.epil + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %add8.epil = add i16 %2, %add.epil + store i16 %add8.epil, i16* %Acc, align 2 + %inc.epil = add nuw i32 %i.018.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to 
i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 +for.body: ; preds = %for.body, %for.body.preheader.new + %5 = phi i16 [ %.pre, %for.body.preheader.new ], [ %add8.3, %for.body ] + %i.018 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 + %6 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.018 + %7 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %7, %6 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.018 + store i16 %add, i16* %arrayidx4, align 2 + %add8 = add i16 %5, %add + store i16 %add8, i16* %Acc, align 2 + %inc = or i32 %i.018, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %8 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %9 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %9, %8 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %add8.1 = add i16 %add8, %add.1 + store i16 %add8.1, i16* %Acc, align 2 + %inc.1 = or i32 %i.018, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %10 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %11 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %11, %10 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %add8.2 = add i16 %add8.1, %add.2 + store i16 %add8.2, i16* %Acc, align 2 + %inc.2 = or i32 %i.018, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %12 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %13 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %13, %12 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %add8.3 = add i16 %add8.2, %add.3 + store i16 %add8.3, i16* %Acc, align 2 + %inc.3 = add i32 %i.018, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK: @sadd16_unroll5 +define void @sadd16_unroll5(i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = 
load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 + +; CHECK-NOT: call i32 @llvm.arm.sadd +; CHECK: br i1 %exitcond +for.body: ; preds = %for.body, %entry + %i.010 = phi i32 [ 0, %entry ], [ %inc.4, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.010 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw nsw i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add nuw nsw i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %4 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %5 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %5, %4 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = add nuw nsw i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %6 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %7 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %7, %6 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add nuw nsw i32 %i.010, 4 + %arrayidx.4 = getelementptr inbounds i16, i16* %A, i32 %inc.3 + %8 = load i16, i16* %arrayidx.4, align 2 + %arrayidx1.4 = getelementptr inbounds i16, i16* %B, i32 %inc.3 + %9 = load i16, i16* %arrayidx1.4, align 2 + %add.4 = add i16 %9, %8 + %arrayidx4.4 = getelementptr inbounds i16, i16* %Out, i32 %inc.3 + store i16 %add.4, i16* %arrayidx4.4, align 2 + %inc.4 = add nuw nsw i32 %i.010, 5 + %exitcond.4 = icmp eq i32 %inc.4, 120 + br i1 %exitcond.4, label %for.cond.cleanup, label %for.body +} + +; CHECK: @sadd16_unroll6 +define void @sadd16_unroll6(i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], 
align 2 + +; CHECK: [[B3:%[^ ]+]] = bitcast i16* %arrayidx1.4 to i32* +; CHECK: [[LD_B3:%[^ ]+]] = load i32, i32* [[B3]], align 2 +; CHECK: [[A3:%[^ ]+]] = bitcast i16* %arrayidx.4 to i32* +; CHECK: [[LD_A3:%[^ ]+]] = load i32, i32* [[A3]], align 2 +; CHECK: [[SADD2:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B3]], i32 [[LD_A3]]) +; CHECK: [[C3:%[^ ]+]] = bitcast i16* %arrayidx4.4 to i32* +; CHECK: store i32 [[SADD2]], i32* [[C3]], align 2 +for.body: ; preds = %for.body, %entry + %i.010 = phi i32 [ 0, %entry ], [ %inc.5, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.010 + store i16 %add, i16* %arrayidx4, align 2 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add nuw nsw i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %4 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %5 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %5, %4 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = add nuw nsw i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %6 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %7 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %7, %6 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add nuw nsw i32 %i.010, 4 + %arrayidx.4 = getelementptr inbounds i16, i16* %A, i32 %inc.3 + %8 = load i16, i16* %arrayidx.4, align 2 + %arrayidx1.4 = getelementptr inbounds i16, i16* %B, i32 %inc.3 + %9 = load i16, i16* %arrayidx1.4, align 2 + %add.4 = add i16 %9, %8 + %arrayidx4.4 = getelementptr inbounds i16, i16* %Out, i32 %inc.3 + store i16 %add.4, i16* %arrayidx4.4, align 2 + %inc.4 = add nuw nsw i32 %i.010, 5 + %arrayidx.5 = getelementptr inbounds i16, i16* %A, i32 %inc.4 + %10 = load i16, i16* %arrayidx.5, align 2 + %arrayidx1.5 = getelementptr inbounds i16, i16* %B, i32 %inc.4 + %11 = load i16, i16* %arrayidx1.5, align 2 + %add.5 = add i16 %11, %10 + %arrayidx4.5 = getelementptr inbounds i16, i16* %Out, i32 %inc.4 + store i16 %add.5, i16* %arrayidx4.5, align 2 + %inc.5 = add nuw nsw i32 %i.010, 6 + %exitcond.5 = icmp eq i32 %inc.5, 120 + br i1 %exitcond.5, label %for.cond.cleanup, label %for.body +} Index: test/CodeGen/ARM/sadd8.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd8.ll @@ -0,0 +1,83 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK: @four_seq_add_i8 +define void @four_seq_add_i8(i32 %N, i8* noalias nocapture readonly %A, + i8* noalias nocapture readonly %B, + i8* noalias nocapture %Out) local_unnamed_addr #0 { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; 
preds = %entry + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.011.epil + %2 = load i8, i8* %arrayidx.epil, align 1 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.011.epil + %3 = load i8, i8* %arrayidx1.epil, align 1 + %add.epil = add i8 %3, %2 + %arrayidx4.epil = getelementptr inbounds i8, i8* %Out, i32 %i.011.epil + store i8 %add.epil, i8* %arrayidx4.epil, align 1 + %inc.epil = add nuw i32 %i.011.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +; CHECK-NOT: call i32 @llvm.arm.sadd +for.body: ; preds = %for.body, %for.body.preheader.new + %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011 + %4 = load i8, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011 + %5 = load i8, i8* %arrayidx1, align 1 + %add = add i8 %5, %4 + %arrayidx4 = getelementptr inbounds i8, i8* %Out, i32 %i.011 + store i8 %add, i8* %arrayidx4, align 1 + %inc = or i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc + %6 = load i8, i8* %arrayidx.1, align 1 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %7 = load i8, i8* %arrayidx1.1, align 1 + %add.1 = add i8 %7, %6 + %arrayidx4.1 = getelementptr inbounds i8, i8* %Out, i32 %inc + store i8 %add.1, i8* %arrayidx4.1, align 1 + %inc.1 = or i32 %i.011, 2 + %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1 + %8 = load i8, i8* %arrayidx.2, align 1 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %9 = load i8, i8* %arrayidx1.2, align 1 + %add.2 = add i8 %9, %8 + %arrayidx4.2 = getelementptr inbounds i8, i8* %Out, i32 %inc.1 + store i8 %add.2, i8* %arrayidx4.2, align 1 + %inc.2 = or i32 %i.011, 3 + %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2 + %10 = load i8, i8* %arrayidx.3, align 1 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %11 = load i8, i8* %arrayidx1.3, align 1 + %add.3 = add i8 %11, %10 + %arrayidx4.3 = getelementptr inbounds i8, i8* %Out, i32 %inc.2 + store i8 %add.3, i8* %arrayidx4.3, align 1 + %inc.3 = add i32 %i.011, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +}
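
For reference, a hypothetical C-level loop that the two_seq_add_i16 / four_seq_add_i16 tests above correspond to (an assumption reconstructed from the IR, not taken from the patch): each iteration performs two or four sequential i16 lanes of Out[i] = A[i] + B[i], and when Out cannot alias A and B (the noalias cases in sadd16.ll) FindParallelChains() widens each pair of lanes into one i32 load per input array, a call to llvm.arm.sadd16, and one i32 store.

  #include <cstdint>

  // Hypothetical source for the unrolled-by-2 case (two_seq_add_i16);
  // the i8 tests follow the same shape with int8_t elements.
  void two_seq_add_i16(uint32_t N, const int16_t *A, const int16_t *B,
                       int16_t *__restrict Out) {
    for (uint32_t i = 0; i != N; i += 2) {
      Out[i]     = int16_t(A[i] + B[i]);         // lane 0
      Out[i + 1] = int16_t(A[i + 1] + B[i + 1]); // lane 1
    }
  }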