Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -63,21 +63,16 @@
     Instruction   *Root;
     ValueList     AllValues;
     MemInstList   VecLd;    // List of all load instructions.
-    MemLocList    MemLocs;  // All memory locations read by this tree.
+    MemInstList   Loads;
     bool          ReadOnly = true;

     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
     virtual ~OpChain() = default;

-    void SetMemoryLocations() {
-      const auto Size = LocationSize::unknown();
+    void PopulateLoads() {
       for (auto *V : AllValues) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          if (I->mayWriteToMemory())
-            ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
-            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
-        }
+        if (auto *Ld = dyn_cast<LoadInst>(V))
+          Loads.push_back(Ld);
       }
     }

@@ -140,12 +135,11 @@
     std::map<LoadInst*, LoadInst*> LoadPairs;
     std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

-    bool RecordSequentialLoads(BasicBlock *BB);
+    bool RecordMemoryOps(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                            SmallVectorImpl<LoadInst*> &Loads,
-                            IntegerType *LoadTy);
+    LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                             IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
     Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
                                  SmallVectorImpl<LoadInst*> &VecLd1,
@@ -164,6 +158,12 @@
     ARMParallelDSP() : LoopPass(ID) { }

+    bool doInitialization(Loop *L, LPPassManager &LPM) override {
+      LoadPairs.clear();
+      WideLoads.clear();
+      return true;
+    }
+
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       LoopPass::getAnalysisUsage(AU);
       AU.addRequired<AssumptionCacheTracker>();
@@ -228,7 +228,7 @@

     if (!ST->isLittle()) {
       LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
-                 "ARMParallelDSP\n");
+                        << "ARMParallelDSP\n");
       return false;
     }

@@ -237,7 +237,7 @@
     LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
     LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

-    if (!RecordSequentialLoads(Header)) {
+    if (!RecordMemoryOps(Header)) {
       LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
       return false;
     }
@@ -314,11 +314,18 @@
   return true;
 }

-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
+  SmallVector<Instruction*, 8> Writes;
+
+  // Collect loads and instructions that may write to memory. For now we only
+  // record loads which are simple, sign-extended and have a single user.
+  // TODO: Allow zero-extended loads.
   for (auto &I : *BB) {
+    if (I.mayWriteToMemory())
+      Writes.push_back(&I);
     auto *Ld = dyn_cast<LoadInst>(&I);
     if (!Ld || !Ld->isSimple() ||
         !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
@@ -326,13 +333,57 @@
     Loads.push_back(Ld);
   }

-  for (auto *Ld0 : Loads) {
-    for (auto *Ld1 : Loads) {
-      if (Ld0 == Ld1)
+  using InstSet = std::set<Instruction*>;
+  using DepMap = std::map<Instruction*, InstSet>;
+  DepMap RAWDeps;
+
+  // Record any writes that may alias a load.
+  const auto Size = LocationSize::unknown();
+  for (auto Read : Loads) {
+    for (auto Write : Writes) {
+      if (Read == Write)
+        continue;
+
+      MemoryLocation ReadLoc =
+        MemoryLocation(Read->getPointerOperand(), Size);
+
+      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+                                         ModRefInfo::ModRef)))
+        continue;
+      if (DT->dominates(Write, Read))
+        RAWDeps[Read].insert(Write);
+    }
+  }
+
+  // Check whether there is a write between the two loads which would
+  // prevent them from being safely merged.
+  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+    LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+
+    if (RAWDeps.count(Dominated)) {
+      InstSet &WritesBefore = RAWDeps[Dominated];
+
+      for (auto Before : WritesBefore) {
+
+        // We can't move the second load backward, past a write, to merge
+        // with the first load.
+        if (DT->dominates(Dominator, Before))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  // Record base, offset load pairs.
+  for (auto *Base : Loads) {
+    for (auto *Offset : Loads) {
+      if (Base == Offset)
         continue;

-      if (AreSequentialAccesses(Ld0, Ld1, *DL, *SE)) {
-        LoadPairs[Ld0] = Ld1;
+      if (AreSequentialAccesses(Base, Offset, *DL, *SE) &&
+          SafeToPair(Base, Offset)) {
+        LoadPairs[Base] = Offset;
         break;
       }
     }
   }
@@ -442,9 +493,9 @@
   for (auto &Pair : Reduction.PMACPairs) {
     BinOpChain *PMul0 = Pair.first;
     BinOpChain *PMul1 = Pair.second;
-    LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; PMul0->Root->dump();
-               dbgs() << "- "; PMul1->Root->dump());
+    LLVM_DEBUG(dbgs() << "Found parallel MACs:\n"
+               << "- " << *PMul0->Root << "\n"
+               << "- " << *PMul1->Root << "\n");

     Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
                           InsertAfter);
@@ -459,54 +510,6 @@
   return false;
 }

-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
-                            ReductionList &Reductions) {
-  RecurrenceDescriptor RecDesc;
-  const bool HasFnNoNaNAttr =
-    F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-  const BasicBlock *Latch = TheLoop->getLoopLatch();
-
-  for (PHINode &Phi : Header->phis()) {
-    const auto *Ty = Phi.getType();
-    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
-      continue;
-
-    const bool IsReduction =
-      RecurrenceDescriptor::AddReductionVar(&Phi,
-                                            RecurrenceDescriptor::RK_IntegerAdd,
-                                            TheLoop, HasFnNoNaNAttr, RecDesc);
-    if (!IsReduction)
-      continue;
-
-    Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
-    if (!Acc)
-      continue;
-
-    Reductions.push_back(Reduction(&Phi, Acc));
-  }
-
-  LLVM_DEBUG(
-    dbgs() << "\nAccumulating integer additions (reductions) found:\n";
-    for (auto &R : Reductions) {
-      dbgs() << "- "; R.Phi->dump();
-      dbgs() << "-> "; R.AccIntAdd->dump();
-    }
-  );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
-                            Instruction *Mul,
-                            Value *MulOp0, Value *MulOp1) {
-  assert(Mul->getOpcode() == Instruction::Mul &&
-         "expected mul instruction");
-  ValueList LHS;
-  ValueList RHS;
-  if (IsNarrowSequence<16>(MulOp0, LHS) &&
-      IsNarrowSequence<16>(MulOp1, RHS)) {
-    Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
-  }
-}
-
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
@@ -528,8 +531,14 @@
     case Instruction::Mul: {
       Value *MulOp0 = I->getOperand(0);
       Value *MulOp1 = I->getOperand(1);
-      if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1))
-        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+      if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1)) {
+        ValueList LHS;
+        ValueList RHS;
+        if (IsNarrowSequence<16>(MulOp0, LHS) &&
+            IsNarrowSequence<16>(MulOp1, RHS)) {
+          Candidates.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+        }
+      }
       return false;
     }
     case Instruction::SExt:
@@ -543,52 +552,6 @@
                     << Candidates.size() << " candidates.\n");
 }

-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
-                            Instructions &Writes) {
-  for (auto &I : *Header) {
-    if (I.mayReadFromMemory())
-      Reads.push_back(&I);
-    if (I.mayWriteToMemory())
-      Writes.push_back(&I);
-  }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
-  LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
-    // At the moment, we allow only simple chains that only consist of reads,
-    // accumulate their result with an integer add, and thus that don't write
-    // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
-      return true;
-
-    // Now for all writes in the basic block, check that they don't alias with
-    // the memory locations accessed by our MAC-chain:
-    for (auto *I : Writes) {
-      LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
-        if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
-                                          ModRefInfo::ModRef))) {
-          LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
-          return true;
-        }
-      }
-    }
-  }
-
-  LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
-  return false;
-}
-
 static bool CheckMACMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
@@ -597,7 +560,7 @@
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-    C->SetMemoryLocations();
+    C->PopulateLoads();
     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
@@ -643,14 +606,36 @@
 // before the loop begins.
 //
 bool ARMParallelDSP::MatchSMLAD(Function &F) {
-  BasicBlock *Header = L->getHeader();
-  LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
-             dbgs() << "Header block:\n"; Header->dump();
-             dbgs() << "Loop info:\n\n"; L->dump());
-  bool Changed = false;
+  auto FindReductions = [&](ReductionList &Reductions) {
+    RecurrenceDescriptor RecDesc;
+    const bool HasFnNoNaNAttr =
+      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+    BasicBlock *Latch = L->getLoopLatch();
+
+    for (PHINode &Phi : Latch->phis()) {
+      const auto *Ty = Phi.getType();
+      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+        continue;
+
+      const bool IsReduction = RecurrenceDescriptor::AddReductionVar(
+        &Phi, RecurrenceDescriptor::RK_IntegerAdd, L, HasFnNoNaNAttr, RecDesc);
+
+      if (!IsReduction)
+        continue;
+
+      Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+      if (!Acc)
+        continue;
+
+      Reductions.push_back(Reduction(&Phi, Acc));
+    }
+    return !Reductions.empty();
+  };
+
   ReductionList Reductions;
-  MatchReductions(F, L, Header, Reductions);
+  if (!FindReductions(Reductions))
+    return false;

   for (auto &R : Reductions) {
     OpChainList MACCandidates;
@@ -666,72 +651,79 @@
                dbgs() << "\n";);
   }

-  // Collect all instructions that may read or write memory. Our alias
-  // analysis checks bail out if any of these instructions aliases with an
-  // instruction from the MAC-chain.
-  Instructions Reads, Writes;
-  AliasCandidates(Header, Reads, Writes);
-
+  bool Changed = false;
+  // Check whether statements in the basic block that write to memory alias
+  // with the memory locations accessed by the MAC-chains.
   for (auto &R : Reductions) {
-    if (AreAliased(AA, Reads, Writes, R.MACCandidates))
-      return false;
     CreateParallelMACPairs(R);
     Changed |= InsertParallelMACs(R);
   }

-  LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
   return Changed;
 }

-LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                                        SmallVectorImpl<LoadInst*> &Loads,
-                                        IntegerType *LoadTy) {
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                                         IntegerType *LoadTy) {
   assert(Loads.size() == 2 && "currently only support widening two loads");
-
-  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
-  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
+
+  LoadInst *Base = Loads[0];
+  LoadInst *Offset = Loads[1];
+
+  Instruction *BaseSExt = dyn_cast<Instruction>(Base->user_back());
+  Instruction *OffsetSExt = dyn_cast<Instruction>(Offset->user_back());
+
+  assert((Base->hasOneUse() && Offset->hasOneUse() && BaseSExt && OffsetSExt)
+         && "Loads should have a single, extending, user");
+
+  std::function<void(Value*, Value*)> MoveBefore =
+    [&](Value *A, Value *B) -> void {
+      if (!isa<Instruction>(A) || !isa<Instruction>(B))
+        return;
+
+      auto *Source = cast<Instruction>(A);
+      auto *Sink = cast<Instruction>(B);
+
+      if (DT->dominates(Source, Sink) ||
+          Source->getParent() != Sink->getParent() ||
+          isa<PHINode>(Source) || isa<PHINode>(Sink))
+        return;
+
+      Source->moveBefore(Sink);
+      for (auto &U : Source->uses())
+        MoveBefore(Source, U.getUser());
+    };
+
+  // Insert the load at the point of the original dominating load.
+  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+                          ++BasicBlock::iterator(DomLoad));
+
+  // Bitcast the pointer to a wider type and create the wide load, while making
+  // sure to maintain the original alignment as this prevents ldrd from being
+  // generated when it could be illegal due to memory alignment.
+  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
   LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
-                                             Loads[0]->getAlignment());
-  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
-  Instruction *SExt0 = dyn_cast<Instruction>(Loads[0]->user_back());
-  Instruction *SExt1 = dyn_cast<Instruction>(Loads[1]->user_back());
-
-  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
-         "Loads should have a single, extending, user");
-
-  std::function<void(Instruction*, Instruction*)> MoveAfter =
-    [&](Instruction* Source, Instruction* Sink) -> void {
-      if (DT->dominates(Source, Sink) ||
-          Source->getParent() != Sink->getParent() ||
-          isa<PHINode>(Source) || isa<PHINode>(Sink))
-        return;
-
-      Sink->moveAfter(Source);
-      for (auto &U : Sink->uses())
-        MoveAfter(Sink, cast<Instruction>(U.getUser()));
-    };
+                                             Base->getAlignment());
+
+  // Make sure everything is in the correct order in the basic block.
+  MoveBefore(Base->getPointerOperand(), VecPtr);
+  MoveBefore(VecPtr, WideLoad);

   // From the wide load, create two values that equal the original two loads.
-  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
-  SExt0->setOperand(0, Bottom);
-  if (auto *I = dyn_cast<Instruction>(Bottom)) {
-    I->moveAfter(WideLoad);
-    MoveAfter(I, SExt0);
-  }
+  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  // TODO: Support big-endian as well.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+  BaseSExt->setOperand(0, Bottom);

-  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
-  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
-  if (auto *I = dyn_cast<Instruction>(Top))
-    MoveAfter(WideLoad, I);
+  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+  OffsetSExt->setOperand(0, Trunc);

-  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
-  SExt1->setOperand(0, Trunc);
-  if (auto *I = dyn_cast<Instruction>(Trunc))
-    MoveAfter(I, SExt1);
-
-  WideLoads.emplace(std::make_pair(Loads[0],
+  WideLoads.emplace(std::make_pair(Base,
                                    make_unique<WidenedLoad>(Loads, WideLoad)));
   return WideLoad;
 }
@@ -748,15 +740,13 @@
                     << "- " << *Acc << "\n"
                     << "- Exchange: " << Exchange << "\n");

-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                              ++BasicBlock::iterator(InsertAfter));
-
   // Replace the reduction chain with an intrinsic call
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
   LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
-    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+    WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
   LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
-    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+    WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
   Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
@@ -767,6 +757,9 @@
     SMLAD = Acc->getType()->isIntegerTy(32) ?
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                              ++BasicBlock::iterator(InsertAfter));
   CallInst *Call = Builder.CreateCall(SMLAD, Args);
   NumSMLAD++;
   return Call;
Index: test/CodeGen/ARM/ParallelDSP/aliasing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ParallelDSP/aliasing.ll
@@ -0,0 +1,506 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -verify -S | FileCheck %s
+;
+; Alias check: check that the rewrite isn't triggered when there's a store
+; instruction possibly aliasing any mul load operands; arguments are passed
+; without 'restrict' enabled.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @no_restrict(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
+  store i16 42, i16* %arrayidx, align 2
+
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; Alias check: check that the rewrite isn't triggered when there's a store
+; aliasing one of the mul load operands. Arguments are now annotated with
+; 'noalias'.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @restrict(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing only with loads from 'arrayidx'.
+  store i16 42, i16* %arrayidx, align 2
+
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+
+; Here the Mul is the LHS, and the Add the RHS.
+  %add11 = add i32 %mul9, %add10
+
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_dominates_all
+; CHECK: store
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: smlad
+define dso_local i32 @store_dominates_all(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  store i16 42, i16* %arrayidx, align 2
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: loads_dominate
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK: store
+define dso_local i32 @loads_dominate(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  store i16 42, i16* %arrayidx, align 2
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_1
+; CHECK-NOT: store
+; CHECK: phi i32
+; CHECK: [[IV:%[^ ]+]] = phi i32 [ %add
+; CHECK: [[ARG3_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg3, i32 [[IV]]
+; CHECK: [[ARG3:%[^ ]+]] = bitcast i16* [[ARG3_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG3]]
+; CHECK: [[ARG2_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg2, i32 [[IV]]
+; CHECK: [[ARG2:%[^ ]+]] = bitcast i16* [[ARG2_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG2]]
+; CHECK: store
+define dso_local i32 @store_alias_arg3_legal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  store i16 42, i16* %arrayidx, align 2
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_2
+; CHECK-NOT: store
+; CHECK: [[BITCAST:[^ ]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: load i32, i32* [[BITCAST]]
+; CHECK: store i16 42, i16* %arrayidx
+; CHECK: [[BITCAST3:[^ ]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: load i32, i32* [[BITCAST3]]
+; CHECK: smlad
+define dso_local i32 @store_alias_arg3_legal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  store i16 42, i16* %arrayidx, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  store i16 42, i16* %arrayidx1, align 2
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  store i16 42, i16* %arrayidx, align 2
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_1(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  store i16 42, i16* %arrayidx6, align 2
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_2(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  store i16 42, i16* %arrayidx3, align 2
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: one_pair_alias
+; FIXME: This test shows we have a bug with smlad insertion
+define i32 @one_pair_alias(i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add26
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.050 = phi i32 [ 0, %entry ], [ %add27, %for.body ]
+  %a.049 = phi i32 [ 0, %entry ], [ %add26, %for.body ]
+  %add3 = or i32 %i.050, 1
+  %add11 = or i32 %i.050, 2
+  %add19 = or i32 %i.050, 3
+  %arrayidx = getelementptr inbounds i16, i16* %b, i32 %i.050
+  %arrayidx4 = getelementptr inbounds i16, i16* %b, i32 %add3
+  %arrayidx12 = getelementptr inbounds i16, i16* %b, i32 %add11
+  %arrayidx20 = getelementptr inbounds i16, i16* %b, i32 %add19
+  %arrayidx1 = getelementptr inbounds i16, i16* %c, i32 %i.050
+  %arrayidx7 = getelementptr inbounds i16, i16* %c, i32 %add3
+  %arrayidx15 = getelementptr inbounds i16, i16* %c, i32 %add11
+  %arrayidx23 = getelementptr inbounds i16, i16* %c, i32 %add19
+  %tmp = load i16, i16* %arrayidx, align 2
+  %tmp2 = load i16, i16* %arrayidx4, align 2
+  %tmp4 = load i16, i16* %arrayidx12, align 2
+  %tmp6 = load i16, i16* %arrayidx20, align 2
+  %tmp1 = load i16, i16* %arrayidx1, align 2
+  store i16 43, i16 *%arrayidx7
+  %tmp3 = load i16, i16* %arrayidx7, align 2
+  %tmp5 = load i16, i16* %arrayidx15, align 2
+  %tmp7 = load i16, i16* %arrayidx23, align 2
+  %conv = sext i16 %tmp to i32
+  %conv2 = sext i16 %tmp1 to i32
+  %mul = mul nsw i32 %conv2, %conv
+  %add = add nsw i32 %mul, %a.049
+  %conv5 = sext i16 %tmp2 to i32
+  %conv8 = sext i16 %tmp3 to i32
+  %mul9 = mul nsw i32 %conv8, %conv5
+  %add10 = add nsw i32 %add, %mul9
+  %conv13 = sext i16 %tmp4 to i32
+  %conv16 = sext i16 %tmp5 to i32
+  %mul17 = mul nsw i32 %conv16, %conv13
+  %add18 = add nsw i32 %add10, %mul17
+  %conv21 = sext i16 %tmp6 to i32
+  %conv24 = sext i16 %tmp7 to i32
+  %mul25 = mul nsw i32 %conv24, %conv21
+  %add26 = add nsw i32 %add18, %mul25
+  %add27 = add nuw nsw i32 %i.050, 4
+  %cmp = icmp ult i32 %add27, 100
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
Index: test/CodeGen/ARM/ParallelDSP/smlad0.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad0.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad0.ll
@@ -11,11 +11,11 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ;
 ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
Index: test/CodeGen/ARM/ParallelDSP/smlad1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad1.ll
@@ -2,11 +2,11 @@

 ; CHECK-LABEL: @test1
 ; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)

 define dso_local i32 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 entry:
Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad11.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad11.ll
@@ -4,15 +4,15 @@
 ; A more complicated chain: 4 mul operations, so we expect 2 smlad calls.
 ;
 ; CHECK: %mac1{{\.}}054 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
-; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
 ; CHECK: [[V10:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V11:%[0-9]+]] = load i32, i32* [[V10]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
-; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
-; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
 ; CHECK: [[V15:%[0-9]+]] = bitcast i16* %arrayidx4 to i32*
 ; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
+; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
+; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
+; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
+; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
 ; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]])
 ;
 ; And we don't want to see a 3rd smlad:
Index: test/CodeGen/ARM/ParallelDSP/smlad6.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad6.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; instruction possibly aliasing any mul load operands; arguments are passed
-; without 'restrict' enabled.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
-entry:
-  %cmp24 = icmp sgt i32 %arg, 0
-  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %.pre = load i16, i16* %arg3, align 2
-  %.pre27 = load i16, i16* %arg2, align 2
-  br label %for.body
-
-for.cond.cleanup:
-  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
-  ret i32 %mac1.0.lcssa
-
-for.body:
-  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
-  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
-  %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
-  store i16 42, i16* %arrayidx, align 2
-
-  %add = add nuw nsw i32 %i.025, 1
-  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
-  %1 = load i16, i16* %arrayidx1, align 2
-  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
-  %2 = load i16, i16* %arrayidx3, align 2
-  %conv = sext i16 %2 to i32
-  %conv4 = sext i16 %0 to i32
-  %mul = mul nsw i32 %conv, %conv4
-  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
-  %3 = load i16, i16* %arrayidx6, align 2
-  %conv7 = sext i16 %3 to i32
-  %conv8 = sext i16 %1 to i32
-  %mul9 = mul nsw i32 %conv7, %conv8
-  %add10 = add i32 %mul, %mac1.026
-  %add11 = add i32 %mul9, %add10
-  %exitcond = icmp ne i32 %add, %arg
-  br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
Index: test/CodeGen/ARM/ParallelDSP/smlad7.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad7.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; aliasing one of the mul load operands. Arguments are now annotated with
-; 'noalias'.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
-entry:
-  %cmp24 = icmp sgt i32 %arg, 0
-  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %.pre = load i16, i16* %arg3, align 2
-  %.pre27 = load i16, i16* %arg2, align 2
-  br label %for.body
-
-for.cond.cleanup:
-  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
-  ret i32 %mac1.0.lcssa
-
-for.body:
-  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
-  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
-  %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing only with loads from 'arrayidx'.
-  store i16 42, i16* %arrayidx, align 2
-
-  %add = add nuw nsw i32 %i.025, 1
-  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
-  %1 = load i16, i16* %arrayidx1, align 2
-  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
-  %2 = load i16, i16* %arrayidx3, align 2
-  %conv = sext i16 %2 to i32
-  %conv4 = sext i16 %0 to i32
-  %mul = mul nsw i32 %conv, %conv4
-  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
-  %3 = load i16, i16* %arrayidx6, align 2
-  %conv7 = sext i16 %3 to i32
-  %conv8 = sext i16 %1 to i32
-  %mul9 = mul nsw i32 %conv7, %conv8
-  %add10 = add i32 %mul, %mac1.026
-
-; Here the Mul is the LHS, and the Add the RHS.
-  %add11 = add i32 %mul9, %add10
-
-  %exitcond = icmp ne i32 %add, %arg
-  br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
Index: test/CodeGen/ARM/ParallelDSP/smladx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smladx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smladx-1.ll
@@ -8,15 +8,15 @@
 ; CHECK-LABEL: smladx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]])
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
@@ -124,20 +124,21 @@

 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
 ; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
Index: test/CodeGen/ARM/ParallelDSP/smlald0.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald0.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald0.ll
@@ -11,10 +11,10 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
 ; CHECK-NOT: call i64 @llvm.arm.smlald
 ;
Index: test/CodeGen/ARM/ParallelDSP/smlald1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald1.ll
@@ -2,10 +2,10 @@

 ; CHECK-LABEL: @test1
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)

 define dso_local i64 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
Index: test/CodeGen/ARM/ParallelDSP/smlald2.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald2.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald2.ll
@@ -10,10 +10,10 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
 ; CHECK-NOT: call i64 @llvm.arm.smlald
 ;
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
@@ -7,15 +7,15 @@
 ; CHECK-LABEL: smlaldx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -181,19 +181,21 @@

 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
 ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
@@ -7,15 +7,15 @@
 ; CHECK-LABEL: smlaldx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -180,19 +180,22 @@
 ; CHECK: [[PIN1:%[^ ]+]] = phi i16* [ [[PIN1_NEXT:%[^ ]+]], %for.body ], [ [[PIN1Base]], %for.body.preheader.new ]
 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
 ; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
 ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
-; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
-; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
+; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
+; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
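For reference, this is the shape of the rewrite that the positive tests above check for, as a minimal schematic distilled from the CHECK lines rather than copied from any one test; the value names and surrounding loop are illustrative. Each pair of sequential i16 loads is widened into one i32 load that keeps the original alignment, and the sext/mul/add reduction chain collapses into a single intrinsic call:

; Before: two i16 loads per operand array, two sext/mul chains, two adds.
;   %lo.a = load i16, i16* %a.addr, align 2        ; a[i]
;   %hi.a = load i16, i16* %a.addr.1, align 2      ; a[i+1]
;   %lo.b = load i16, i16* %b.addr, align 2        ; b[i]
;   %hi.b = load i16, i16* %b.addr.1, align 2      ; b[i+1]
;   ... sext each to i32, %lo.a*%lo.b and %hi.a*%hi.b, both added into %acc ...
;
; After: one wide load per array and one call per MAC pair.
;   %a.cast = bitcast i16* %a.addr to i32*
;   %a.wide = load i32, i32* %a.cast, align 2      ; original alignment kept
;   %b.cast = bitcast i16* %b.addr to i32*
;   %b.wide = load i32, i32* %b.cast, align 2
;   %acc.next = call i32 @llvm.arm.smlad(i32 %a.wide, i32 %b.wide, i32 %acc)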