Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -286,6 +286,8 @@
   /// Replace N with M in CurDAG, in a way that also ensures that M gets
   /// selected when N would have been selected.
   void replaceDAGValue(const SDValue &N, SDValue M);
+
+  bool canUseVMLxForwarding(const SDNode &N) const;
 };
 }
@@ -418,9 +420,54 @@
   }
 }
+/// Checks whether VMLS/VMLA, when used instead of N, can be issued back to
+/// back with the previous instruction. E.g., Cortex-A9 NEON MPE uses a special
+/// multiplier accumulator forwarding if a multiply-accumulate follows a
+/// multiply or another multiply-accumulate, and depends on the result of that
+/// first instruction, and the dependency between both instructions is of the
+/// same type and size.
+bool ARMDAGToDAGISel::canUseVMLxForwarding(const SDNode& N) const {
+  if (!Subtarget->hasVMLxForwarding())
+    return false;
+
+  switch (N.getOpcode()) {
+  case ISD::FMA:
+    break;
+
+  case ISD::FSUB:
+  case ISD::FADD: {
+    if (N.getOperand(1).getOpcode() != ISD::FMUL)
+      return false;
+    break;
+  }
+
+  default:
+    return false;
+  }
+
+  const SDValue &Acc = N.getOperand(0);
+  if (Acc.getOpcode() != ISD::FMUL && Acc.getOpcode() != ISD::FMA)
+    return false;
+
+  if (N.getNumValues() != 1)
+    return false;
+
+  if (Acc.getValueType() != N.getValueType(0))
+    return false;
+
+  if (!Acc.getValueType().isSimple())
+    return false;
+
+  if (!Acc.getValueType().isFloatingPoint()
+      || Acc.getValueType().isVector())
+    return false;
+
+  return true;
+}
+
 /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
 /// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
-/// least on current ARM implementations) which should be avoidded.
+/// least on current ARM implementations) which should be avoided.
 bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (OptLevel == CodeGenOpt::None)
     return true;
@@ -428,6 +475,9 @@
   if (!Subtarget->hasVMLxHazards())
     return true;
+  if (canUseVMLxForwarding(*N))
+    return true;
+
   if (!N->hasOneUse())
     return false;
Index: lib/Target/ARM/MLxExpansionPass.cpp
===================================================================
--- lib/Target/ARM/MLxExpansionPass.cpp
+++ lib/Target/ARM/MLxExpansionPass.cpp
@@ -57,13 +57,16 @@
     unsigned MIIdx;
     MachineInstr* LastMIs[4];
     SmallPtrSet<MachineInstr*, 4> IgnoreStall;
+    SmallPtrSet<MachineInstr*, 4> AccForwarding;
     void clearStack();
     void pushStack(MachineInstr *MI);
     MachineInstr *getAccDefMI(MachineInstr *MI) const;
     unsigned getDefReg(MachineInstr *MI) const;
     bool hasLoopHazard(MachineInstr *MI) const;
-    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool hasRAWHazard(MachineInstr *MI, MachineInstr *NextMI) const;
+    bool canUseVMLxForwarding(MachineInstr *MI,
+                              MachineInstr *AccDef) const;
     bool FindMLxHazard(MachineInstr *MI);
     void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
                                 unsigned MulOpc, unsigned AddSubOpc,
@@ -182,17 +185,21 @@
   return DefMI == MI;
 }
-bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+bool MLxExpansion::hasRAWHazard(MachineInstr *MI, MachineInstr *NextMI) const {
+  unsigned Reg = getDefReg(MI);
   // FIXME: Detect integer instructions properly.
-  const MCInstrDesc &MCID = MI->getDesc();
+  const MCInstrDesc &MCID = NextMI->getDesc();
   unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
-  if (MI->mayStore())
+  if (NextMI->mayStore())
     return false;
   unsigned Opcode = MCID.getOpcode();
   if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
     return false;
-  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
-    return MI->readsRegister(Reg, TRI);
+  if (Domain & ARMII::DomainNEON)
+    return NextMI->readsRegister(Reg, TRI);
+  else if (Domain & ARMII::DomainVFP)
+    return NextMI->readsRegister(Reg, TRI) && !AccForwarding.count(MI);
+
   return false;
 }
@@ -210,6 +217,49 @@
   }
 }
+/// Checks whether VMLS/VMLA can be issued back to back with the previous
+/// instruction. E.g., Cortex-A9 NEON MPE uses a special multiplier
+/// accumulator forwarding if a multiply-accumulate follows a multiply or
+/// another multiply-accumulate, and depends on the result of that first
+/// instruction, and the dependency between both instructions is of the
+/// same type and size.
+bool MLxExpansion::canUseVMLxForwarding(MachineInstr* MI,
+                                        MachineInstr *AccDef) const {
+  assert(MI);
+  assert(AccDef);
+  assert(TII->isFpMLxInstruction(MI->getOpcode()));
+  assert(MI->getParent());
+  assert(MI->getParent()->getParent());
+
+  if (!MI->getParent()->getParent()
+         ->getSubtarget<ARMSubtarget>().hasVMLxForwarding())
+    return false;
+
+  const unsigned AccDefOpcode = AccDef->getOpcode();
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+
+  case ARM::VMLAS:
+  case ARM::VMLSS:
+    if (AccDefOpcode != ARM::VMLAS && AccDefOpcode != ARM::VMLSS &&
+        AccDefOpcode != ARM::VMULS) {
+      return false;
+    }
+    break;
+
+  case ARM::VMLAD:
+  case ARM::VMLSD:
+    if (AccDefOpcode != ARM::VMLAD && AccDefOpcode != ARM::VMLSD &&
+        AccDefOpcode != ARM::VMULD) {
+      return false;
+    }
+    break;
+  }
+
+  return true;
+}
+
 bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
   if (NumExpand >= ExpandLimit)
     return false;
@@ -218,6 +268,12 @@
     return true;
   MachineInstr *DefMI = getAccDefMI(MI);
+
+  if (canUseVMLxForwarding(MI, DefMI)) {
+    AccForwarding.insert(DefMI);
+    return false;
+  }
+
   if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
     // r0 = vmla
     // r3 = vmla r0, r1, r2
@@ -259,7 +315,7 @@
   }
   // Look for VMLx RAW hazard.
-    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+    if (i <= Limit2 && hasRAWHazard(MI, NextMI))
       return true;
   }
@@ -330,6 +386,7 @@
   clearStack();
   IgnoreStall.clear();
+  AccForwarding.clear();
   unsigned Skip = 0;
   MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
Index: test/CodeGen/ARM/fmacs.ll
===================================================================
--- test/CodeGen/ARM/fmacs.ll
+++ test/CodeGen/ARM/fmacs.ll
@@ -89,13 +89,11 @@
 ; A9-LABEL: t5:
 ; A9: vmla.f32
-; A9: vmul.f32
-; A9: vadd.f32
+; A9: vmla.f32
 
 ; HARD-LABEL: t5:
 ; HARD: vmla.f32 s4, s0, s1
-; HARD: vmul.f32 s0, s2, s3
-; HARD: vadd.f32 s0, s4, s0
+; HARD: vmla.f32 s4, s2, s3
 
   %0 = fmul float %a, %b
   %1 = fadd float %e, %0
   %2 = fmul float %c, %d
Index: test/CodeGen/ARM/fml.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/fml.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=FPSCALAR
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=FPVECTOR
+
+define double @test1(double %a, double %b, double %c, double %d, double %e, double %f) {
+  %1 = fmul double %a, %c
+  %2 = fmul double %b, %d
+  %3 = fsub double %1, %2
+; FPSCALAR: vmls.f64 {{.*}}
+
+  %4 = fmul double %a, %d
+  %5 = fmul double %b, %c
+  %6 = fadd double %5, %4
+; FPSCALAR: vmla.f64 {{.*}}
+
+  %7 = fsub double %e, %3
+  %8 = fsub double %f, %6
+  %9 = fadd double %3, %8
+  %10 = fadd double %6, %7
+  %11 = fmul double %9, %10
+
+  ret double %11
+}
+
+define float @test2(float %a, float %b, float %c, float %d, float %e, float %f) {
+  %1 = fmul float %a, %c
+  %2 = fmul float %b, %d
+  %3 = fsub float %1, %2
+; FPSCALAR: vmls.f32 {{.*}}
+
+  %4 = fmul float %a, %d
+  %5 = fmul float %b, %c
+  %6 = fadd float %5, %4
+; FPSCALAR: vmla.f32 {{.*}}
+
+  %7 = fsub float %e, %3
+  %8 = fsub float %f, %6
+  %9 = fadd float %3, %8
+  %10 = fadd float %6, %7
+  %11 = fmul float %9, %10
+
+  ret float %11
+}
+
+define <2 x float> @test3(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, <2 x float> %e, <2 x float> %f) {
+; FPVECTOR-LABEL: test3
+  %1 = fmul <2 x float> %a, %c
+  %2 = fmul <2 x float> %b, %d
+  %3 = fsub <2 x float> %1, %2
+; FPVECTOR-NOT: vmls{{.*}}
+
+  %4 = fmul <2 x float> %a, %d
+  %5 = fmul <2 x float> %b, %c
+  %6 = fadd <2 x float> %5, %4
+; FPVECTOR-NOT: vmla{{.*}}
+
+  %7 = fsub <2 x float> %e, %3
+  %8 = fsub <2 x float> %f, %6
+  %9 = fadd <2 x float> %3, %8
+  %10 = fadd <2 x float> %6, %7
+  %11 = fmul <2 x float> %9, %10
+
+  ret <2 x float> %11
+}