Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -53,7 +53,7 @@
   using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
-  using MemInstList     = SmallVector<Instruction*, 8>;
+  using MemInstList     = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
@@ -113,6 +113,21 @@
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
 
+  /// Pairs a wide load with the narrow loads that it was created from.
+  class WidenedLoad {
+    LoadInst *NewLd = nullptr;
+    SmallVector<LoadInst*, 4> Loads;
+
+  public:
+    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
+      : NewLd(Wide), Loads(Lds.begin(), Lds.end()) {}
+
+    /// Returns the wide load supplied at construction.
+    LoadInst *getLoad() const {
+      return NewLd;
+    }
+  };
+
   class ARMParallelDSP : public LoopPass {
     ScalarEvolution   *SE;
     AliasAnalysis     *AA;
@@ -123,12 +138,17 @@
     const DataLayout  *DL;
     Module            *M;
     std::map<LoadInst*, LoadInst*> LoadPairs;
+    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
 
     bool PrepareForBlock(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                            SmallVectorImpl<LoadInst*> &Loads,
+                            IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
-    Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+    Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                 SmallVectorImpl<LoadInst*> &VecLd1,
                                  Instruction *Acc, bool Exchange,
                                  Instruction *InsertAfter);
 
@@ -225,7 +245,6 @@
 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
 template<unsigned MaxBitWidth>
 static bool IsNarrowSequence(Value *V, ValueList &VL) {
-  LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
   ConstantInt *CInt;
 
   if (match(V, m_ConstantInt(CInt))) {
@@ -244,23 +263,17 @@
   } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
     // TODO: we need to implement sadd16/sadd8 for this, which enables to
     // also do the rewrite for smlad8.ll, but it is unsupported for now.
-    LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
     return false;
   } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
-    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
-      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
-        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
       return false;
-    }
 
     if (match(Val, m_Load(m_Value()))) {
-      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
       VL.push_back(Val);
       VL.push_back(I);
       return true;
     }
   }
-  LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
   return false;
 }
 
@@ -277,14 +290,6 @@
   if (!Ld0 || !Ld1)
     return false;
 
-  // Expect that the load is only used by one extending instruction.
-  if (!Ld0->hasOneUse() || !Ld1->hasOneUse())
-    return false;
-
-  // The extending instruction should also only have a single user.
-  if (!Ld0->user_back()->hasOneUse() || !Ld1->user_back()->hasOneUse())
-    return false;
-
   if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
     return false;
 
@@ -299,6 +304,17 @@
   return true;
 }
 
+/// Returns true if \p I is a load whose only user is a sign-extend. A load
+/// that fails this test is rejected silently; any other opcode is reported
+/// on the debug stream before being rejected.
+static bool isProfitableToWiden(Instruction *I) {
+  if (I->getOpcode() == Instruction::Load)
+    return I->hasOneUse() && isa<SExtInst>(I->user_back());
+
+  LLVM_DEBUG(dbgs() << "Not profitable to widen: " << *I << "\n");
+  return false;
+}
+
 /// Iterate through the block and record base, offset pairs of loads as well as
 /// maximal sequences of sequential loads.
 bool ARMParallelDSP::PrepareForBlock(BasicBlock *BB) {
@@ -308,7 +324,7 @@
     if (I.mayWriteToMemory())
       Writes.insert(&I);
     auto *Ld = dyn_cast<LoadInst>(&I);
-    if (!Ld || !Ld->isSimple())
+    if (!Ld || !Ld->isSimple() || !isa<IntegerType>(Ld->getType()))
       continue;
     Loads.insert(Ld);
   }
@@ -372,7 +388,8 @@
       if (Ld0 == Ld1)
         continue;
 
-      if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
+      if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE) &&
+          isProfitableToWiden(Ld0) && isProfitableToWiden(Ld1)) {
         LoadPairs[Ld0] = Ld1;
         break;
       }
@@ -414,12 +431,11 @@
       if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
         return false;
 
-      LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
-                 << "\t Ld0: " << *Ld0 << "\n"
-                 << "\t Ld1: " << *Ld1 << "\n"
-                 << "and operands " << x + 2 << ":\n"
-                 << "\t Ld2: " << *Ld2 << "\n"
-                 << "\t Ld3: " << *Ld3 << "\n");
+      LLVM_DEBUG(dbgs() << "Loads:\n"
+                 << " - " << *Ld0 << "\n"
+                 << " - " << *Ld1 << "\n"
+                 << " - " << *Ld2 << "\n"
+                 << " - " << *Ld3 << "\n");
 
       if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
         if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
@@ -468,11 +484,6 @@
 
       assert(PMul0 != PMul1 && "expected different chains");
 
-      LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
-                 dbgs() << "- "; Mul0->dump();
-                 dbgs() << "- "; Mul1->dump());
-
-      LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
       if (CanPair(PMul0, PMul1)) {
         Paired.insert(Mul0);
         Paired.insert(Mul1);
@@ -493,9 +504,8 @@
                dbgs() << "- "; PMul0->Root->dump();
                dbgs() << "- "; PMul1->Root->dump());
 
-    auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
-    auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
-    Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
+    Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
+                          InsertAfter);
     InsertAfter = Acc;
   }
 
@@ -551,14 +561,12 @@
 static void AddMACCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
   ValueList RHS;
   if (IsNarrowSequence<16>(MulOp0, LHS) &&
       IsNarrowSequence<16>(MulOp1, RHS)) {
-    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
     Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
   }
 }
@@ -566,7 +574,7 @@
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
-  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
+  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc << "\n");
 
   // Returns false to signal the search should be stopped.
   std::function<bool(Value*)> Match =
@@ -739,32 +747,81 @@
   return Changed;
 }
 
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
-                               Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                                        SmallVectorImpl<LoadInst*> &Loads,
+                                        IntegerType *LoadTy) {
+  assert(Loads.size() == 2 && "currently only support widening two loads");
+  // The wide load reuses the address space, pointer and alignment of Loads[0].
+  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(LoadTy, VecPtr, BaseLoad.getAlignment());
+  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
+                                             Loads[0]->getAlignment());
+  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
+  Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
+
+  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
+         "Loads should have a single, extending, user");
+  // Sinks an instruction, and then its users, below a value it now depends on.
+  std::function<void(Instruction*, Instruction*)> MoveAfter =
+    [&](Instruction* Source, Instruction* Sink) -> void {
+    if (DT->dominates(Source, Sink) ||
+        Source->getParent() != Sink->getParent() ||
+        isa<PHINode>(Source) || isa<PHINode>(Sink))
+      return;
+    // Same block, no PHIs, and Sink is not yet dominated by Source: reorder.
+    Sink->moveAfter(Source);
+    for (auto &U : Sink->uses())
+      MoveAfter(Sink, cast<Instruction>(U.getUser()));
+  };
+
+  // From the wide load, create two values that equal the original two loads.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
+  SExt0->setOperand(0, Bottom);
+  if (auto *I = dyn_cast<Instruction>(Bottom)) {
+    I->moveAfter(WideLoad);
+    MoveAfter(I, SExt0);
+  }
+
+  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
+  if (auto *I = dyn_cast<Instruction>(Top))
+    MoveAfter(WideLoad, I);
+
+  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
+  SExt1->setOperand(0, Trunc);
+  if (auto *I = dyn_cast<Instruction>(Trunc))
+    MoveAfter(I, SExt1);
+
+  WideLoads.emplace(std::make_pair(Loads[0],
+                                   make_unique<WidenedLoad>(Loads, WideLoad)));
+  return WideLoad;
 }
 
-Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                             SmallVectorImpl<LoadInst*> &VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
   LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
-             << "- " << *VecLd0 << "\n"
-             << "- " << *VecLd1 << "\n"
+             << "- " << *VecLd0[0] << "\n"
+             << "- " << *VecLd0[1] << "\n"
+             << "- " << *VecLd1[0] << "\n"
+             << "- " << *VecLd1[1] << "\n"
              << "- " << *Acc << "\n"
-             << "Exchange: " << Exchange << "\n");
+             << "- Exchange: " << Exchange << "\n");
 
   IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                               ++BasicBlock::iterator(InsertAfter));
 
   // Replace the reduction chain with an intrinsic call
-  Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
-  Value* Args[] = { NewLd0, NewLd1, Acc };
+  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
+  LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
+    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+  LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
+    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+  Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
     SMLAD = Acc->getType()->isIntegerTy(32) ?
@@ -792,7 +849,6 @@
     }
 
     const unsigned Pairs = VL0.size();
-    LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
 
     for (unsigned i = 0; i < Pairs; ++i) {
       const Value *V0 = VL0[i];
@@ -800,24 +856,17 @@
       const auto *Inst0 = dyn_cast<Instruction>(V0);
       const auto *Inst1 = dyn_cast<Instruction>(V1);
 
-      LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-                dbgs() << "mul1: "; V0->dump();
-                dbgs() << "mul2: "; V1->dump());
-
       if (!Inst0 || !Inst1)
         return false;
 
-      if (Inst0->isSameOperationAs(Inst1)) {
-        LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+      if (Inst0->isSameOperationAs(Inst1))
         continue;
-      }
 
       const APInt *C0, *C1;
       if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
         return false;
     }
 
-    LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
     return true;
   };
 
Index: test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
@@ -0,0 +1,251 @@
+; RUN: llc -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s
+
+; CHECK-LABEL: add_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
+define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %count.next = add i32 %conv4, %count
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_bottom_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]]
+define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv4, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_top_user
+; CHECK: %for.body
+; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16
+; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]]
+define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv7, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: and_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
+; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]]
+define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %bottom, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: multi_uses
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
+; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]]
+; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
+define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %shl = shl i32 %conv4, 16
+  %add11 = add i32 %mul9, %add10
+  %xor = xor i32 %bottom, %count
+  %count.next = mul i32 %xor, %shl
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
Index: test/CodeGen/ARM/ParallelDSP/remove-duplicate-loads.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/remove-duplicate-loads.ll
+++ test/CodeGen/ARM/ParallelDSP/remove-duplicate-loads.ll
@@ -1,4 +1,6 @@
-; RUN: opt -mtriple=thumbv7em -arm-parallel-dsp -verify %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv7em -arm-parallel-dsp -verify %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
+; RUN: llc -mtriple=thumbv7em %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC
+; RUN: llc -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC
 
 ; CHECK-LABEL: remove_duplicate_load
 define void @remove_duplicate_load(i32* %res, i16* %A, i32 %N) {
@@ -7,9 +9,9 @@
 
 for.body:
 ; CHECK: for.body
-; CHECK: load i16, i16* %ld.addr
-; CHECK: load i16, i16* %ld1.addr
-; CHECK-NOT: load
+; CHECK-OPT: load i16, i16* %ld.addr
+; CHECK-OPT: load i16, i16* %ld1.addr
+; CHECK-OPT-NOT: load
   %idx = phi i32 [ 0, %entry ], [ %idx.next, %for.body ]
   %acc = phi i32 [ 0, %entry ], [ %acc.next, %for.body ]
   %ld.addr = getelementptr inbounds i16, i16* %A, i32 %idx
@@ -42,7 +44,7 @@
 
 for.body:
 ; CHECK: for.body
-; CHECK: %sext.again = sext i16 %ld1.again to i32
+; CHECK-OPT: %sext.again = sext i16 %ld1.again to i32
   %idx = phi i32 [ 0, %entry ], [ %idx.next, %for.body ]
   %acc = phi i32 [ 0, %entry ], [ %acc.next, %for.body ]
   %ld.addr = getelementptr inbounds i16, i16* %A, i32 %idx
@@ -77,7 +79,7 @@
 
 for.body:
 ; CHECK: for.body
-; CHECK: %sext.again = sext i16 %ld1.again to i32
+; CHECK-OPT: %sext.again = sext i16 %ld1.again to i32
   %idx = phi i32 [ 0, %entry ], [ %idx.next, %for.body ]
   %acc = phi i32 [ 0, %entry ], [ %acc.next, %for.body ]
   %ld.addr = getelementptr inbounds i16, i16* %A, i32 %idx
@@ -110,7 +112,7 @@
 
 for.body:
 ; CHECK: for.body
-; CHECK: %sext.again = sext i16 %ld1.again to i32
+; CHECK-OPT: %sext.again = sext i16 %ld1.again to i32
   %idx = phi i32 [ 0, %entry ], [ %idx.next, %for.body ]
   %acc = phi i32 [ 0, %entry ], [ %acc.next, %for.body ]
   %ld.addr = getelementptr inbounds i16, i16* %A, i32 %idx
@@ -163,27 +165,36 @@
   br label %for.body
 
 ; CHECK: %for.body
-; CHECK: %A7 = load i16, i16* %arrayidx4.us.i.i, align 2
-; CHECK: %A8 = load i16, i16* %arrayidx5.us.i.i, align 2
-; CHECK: %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2
-; CHECK: %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2
-; CHECK: %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2
-; CHECK: %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2
-; CHECK: %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2
-; CHECK: %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2
-; CHECK: %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2
-; CHECK: %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2
-; CHECK: %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2
-; CHECK: %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2
-; CHECK: %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2
-; CHECK: %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2
-; CHECK: %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2
-; CHECK: %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2
-; CHECK: %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2
-; CHECK: %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2
-; CHECK: %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2
-; CHECK: %B27 = load i16, i16* %arrayidx4.us.i.3.3.i, align 2
-; CHECK-NOT: load
+; CHECK-OPT: %A7 = load i16, i16* %arrayidx4.us.i.i, align 2
+; CHECK-OPT: %A8 = load i16, i16* %arrayidx5.us.i.i, align 2
+; CHECK-OPT: %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2
+; CHECK-OPT: %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2
+; CHECK-OPT: %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2
+; CHECK-OPT: %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2
+; CHECK-OPT: %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2
+; CHECK-OPT: %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2
+; CHECK-OPT: %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2
+; CHECK-OPT: %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2
+; CHECK-OPT: %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2
+; CHECK-OPT: %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2
+; CHECK-OPT: %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2
+; CHECK-OPT: %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2
+; CHECK-OPT: %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2
+; CHECK-OPT: %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2
+; CHECK-OPT: %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2
+; CHECK-OPT: %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2
+; CHECK-OPT: %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2
+; CHECK-OPT: %B27 = load i16, i16* %arrayidx4.us.i.3.3.i, align 2
+; CHECK-OPT-NOT: load
+
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
+; CHECK-LLC: smlad
 
 for.body:
   %A3 = phi i32 [ %add9.us.i.3361.i, %for.body ], [ 0, %entry ]
Index: test/CodeGen/ARM/ParallelDSP/smlad0.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad0.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad0.ll
@@ -211,47 +211,3 @@
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
 
-define dso_local i32 @SextMultiUse(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
-; CHECK-LABEL: @SextMultiUse
-; CHECK-NOT: call i32 @llvm.arm.smlad
-; CHECK-UNSUPPORTED-NOT:  call i32 @llvm.arm.smlad
-entry:
-  %cmp24 = icmp sgt i32 %arg, 0
-  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %.pre = load i16, i16* %arg3, align 2
-  %.pre27 = load i16, i16* %arg2, align 2
-  br label %for.body
-
-for.cond.cleanup:
-  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
-  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
-  %res = add i32 %mac1.0.lcssa, %count.final
-  ret i32 %res
-
-for.body:
-  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
-  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
-  %0 = load i16, i16* %arrayidx, align 2
-  %add = add nuw nsw i32 %i.025, 1
-  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
-  %1 = load i16, i16* %arrayidx1, align 2
-  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
-  %2 = load i16, i16* %arrayidx3, align 2
-  %conv = sext i16 %2 to i32
-  %conv4 = sext i16 %0 to i32
-  %count.next = add i32 %conv4, %count
-  %mul = mul nsw i32 %conv, %conv4
-  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
-  %3 = load i16, i16* %arrayidx6, align 2
-  %conv7 = sext i16 %3 to i32
-  %conv8 = sext i16 %1 to i32
-  %mul9 = mul nsw i32 %conv7, %conv8
-  %add10 = add i32 %mul, %mac1.026
-  %add11 = add i32 %mul9, %add10
-  %exitcond = icmp ne i32 %add, %arg
-  br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}