Index: llvm/lib/Target/AArch64/AArch64SchedA57.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -93,7 +93,8 @@
 def : SchedAlias<WriteFCvt,  A57Write_5cyc_1V>;
 def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
 def : SchedAlias<WriteFImm,  A57Write_3cyc_1V>;
-def : SchedAlias<WriteFMul,  A57Write_5cyc_1V>;
+// Defined via WriteRes, not SchedAlias, so A57ReadFPMA5 can list WriteFMul.
+def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;}
 def : SchedAlias<WriteFDiv,  A57Write_17cyc_1W>;
 def : SchedAlias<WriteV,     A57Write_3cyc_1V>;
 def : SchedAlias<WriteVLD,   A57Write_5cyc_1L>;
@@ -350,12 +351,17 @@
 //   D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
 //   Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
 
+// Cortex A57 Software Optimization Guide Sec 3.14
+// 3-cycle advance for reads fed by abs-diff/pairwise-add/shift accumulates and
+// shifts. NOTE(review): the 3cyc write then nets 0 latency - confirm in guide.
+def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_3cyc_1X_NonMul_Forward, A57Write_4cyc_1X_NonMul_Forward, A57Write_4cyc_2X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>;
+
 // ASIMD absolute diff accum, D-form
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
 // ASIMD absolute diff accum, Q-form
-def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
 // ASIMD absolute diff accum long
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>;
 
 // ASIMD arith, reduce, 4H/4S
 def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
@@ -372,46 +378,54 @@
 def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
 
 // ASIMD multiply, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
 // ASIMD multiply, Q-form
-def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// Cortex A57 Software Optimization Guide Sec 3.14
+// Read advances (4 and 3 cycles) for mul-accum fed by a W-pipe mul or mul-accum
+def A57ReadIVMA4   : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
+def A57ReadIVMA3   : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
 
 // ASIMD multiply accumulate, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
 // ASIMD multiply accumulate, Q-form
-def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
 
 // ASIMD multiply accumulate long
 // ASIMD multiply accumulate saturating long
-def A57WriteIVMA   : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
-def A57ReadIVMA4   : SchedReadAdvance<4, [A57WriteIVMA]>;
-def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>;
 
 // ASIMD multiply long
-def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
-def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^PMULL(v8i8|v16i8)")>;
 def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
 
 // ASIMD pairwise add and accumulate
 // ASIMD shift accumulate
-def A57WriteIVA    : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
-def A57ReadIVA3    : SchedReadAdvance<3, [A57WriteIVA]>;
-def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
-def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+
+// ASIMD shift by immed, basic
+def : InstRW<[A57Write_3cyc_1X_NonMul_Forward], (instregex "^[SU]?SHLL", "^SHL", "^SLI", "^SRI", "^[SU](SHR|XTL)", "^SHRN")>;
 
 // ASIMD shift by immed, complex
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
-def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^SQSHLU")>;
 
+// ASIMD shift by register, basic, D-form
+def : InstRW<[A57Write_3cyc_1X_NonMul_Forward], (instregex "^[SU]SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
 
 // ASIMD shift by register, basic, Q-form
-def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A57Write_4cyc_2X_NonMul_Forward], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
 
 // ASIMD shift by register, complex, D-form
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
 
 // ASIMD shift by register, complex, Q-form
-def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A57Write_5cyc_2X_NonMul_Forward], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
 
 
 // Vector - Floating Point
@@ -473,18 +487,24 @@
 // ASIMD FP max/min, reduce
 def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
 
+// Cortex A57 Software Optimization Guide Sec 3.15
+// Advances from FP mul and mul-accum to mul-accum
+
 // ASIMD FP multiply, D-form, FZ
-def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
 // ASIMD FP multiply, Q-form, FZ
-def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
 
 // ASIMD FP multiply accumulate, D-form, FZ
 // ASIMD FP multiply accumulate, Q-form, FZ
 def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
 def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10;  }
-def A57ReadFPVMA5  : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+// D-form uses the 5-cycle advance, Q-form the 6-cycle one
+def A57ReadFPVMA5  : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
+def A57ReadFPVMA6  : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
+
 def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
-def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
 
 // ASIMD FP round, D-form
 def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
@@ -547,8 +567,9 @@
 
 def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
 
+// Cortex A57 Software Optimization Guide Sec 3.10
 def A57WriteFPMA  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
-def A57ReadFPMA5  : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPMA5  : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>;
 def A57ReadFPM    : SchedReadAdvance<0>;
 def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
 
Index: llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -13,6 +13,10 @@
 //   Prefix: A57Write
 //   Latency: #cyc
 //   MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//   Postfix (optional): _(Mul|NonMul|FP)_Forward
+//
+//   The postfix marks a SchedWriteRes that is listed as a producer in a
+//   SchedReadAdvance, keeping it distinct from the unsuffixed write.
 //
 // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
 //      11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
@@ -25,7 +29,9 @@
 def A57Write_5cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 5;  }
 def A57Write_5cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 5;  }
 def A57Write_5cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
+def A57Write_5cyc_1V_FP_Forward  : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
 def A57Write_5cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
+def A57Write_5cyc_1W_Mul_Forward  : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
 def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
 def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
                                                     let ResourceCycles = [17]; }
@@ -43,8 +49,11 @@
 def A57Write_3cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 3;  }
 def A57Write_3cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 3;  }
 def A57Write_3cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 3;  }
+def A57Write_3cyc_1X_NonMul_Forward  : SchedWriteRes<[A57UnitX]> { let Latency = 3;  }
 def A57Write_4cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 4;  }
+def A57Write_4cyc_1I  : SchedWriteRes<[A57UnitI]> { let Latency = 4;  }
 def A57Write_4cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
+def A57Write_4cyc_1X_NonMul_Forward  : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
 def A57Write_9cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
 def A57Write_6cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 6;  }
 def A57Write_6cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 6;  }
@@ -93,6 +102,10 @@
   let Latency     = 6;
   let NumMicroOps = 2;
 }
+def A57Write_6cyc_2W_Mul_Forward     : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
 def A57Write_5cyc_1I_1L  : SchedWriteRes<[A57UnitI,
                                           A57UnitL]> {
   let Latency     = 5;
@@ -102,10 +115,18 @@
   let Latency     = 5;
   let NumMicroOps = 2;
 }
+def A57Write_5cyc_2V_FP_Forward     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
 def A57Write_5cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
   let Latency     = 5;
   let NumMicroOps = 2;
 }
+def A57Write_5cyc_2X_NonMul_Forward     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
 def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
                                           A57UnitV]> {
   let Latency     = 10;
@@ -171,6 +192,10 @@
   let Latency     = 4;
   let NumMicroOps = 2;
 }
+def A57Write_4cyc_2X_NonMul_Forward     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
 
 
 //===----------------------------------------------------------------------===//
Index: llvm/test/CodeGen/AArch64/aarch64-misched-forwarding-A57.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-misched-forwarding-A57.ll
@@ -0,0 +1,75 @@
+; -debug-only output is only available in builds with assertions enabled.
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -enable-misched -enable-post-misched=false -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+define <2 x i32> @mlamlaD(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) {
+; CHECK: ********** MI Scheduling **********
+; CHECK-NEXT: mlamlaD:%bb.0
+; CHECK: MLAv2i32
+; CHECK: Latency            : 5
+; CHECK: Successors
+; CHECK-NEXT: SU(6): Data Latency=1
+; CHECK: MLAv2i32
+; CHECK: Latency            : 5
+; CHECK: Successors
+; CHECK-NEXT: SU(7): Data Latency=5
+  %tmp0 = mul <2 x i32> %A, %B
+  %tmp1 = add <2 x i32> %E, %tmp0
+  %tmp2 = mul <2 x i32> %C, %D
+  %tmp3 = add <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @mlamlaQ(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) {
+; CHECK: ********** MI Scheduling **********
+; CHECK-NEXT: mlamlaQ:%bb.0
+; CHECK: MLAv4i32
+; CHECK: Latency            : 6
+; CHECK: Successors
+; CHECK-NEXT: SU(6): Data Latency=2
+; CHECK: MLAv4i32
+; CHECK: Latency            : 6
+; CHECK: Successors
+; CHECK-NEXT: SU(7): Data Latency=6
+  %tmp0 = mul <4 x i32> %A, %B
+  %tmp1 = add <4 x i32> %E, %tmp0
+  %tmp2 = mul <4 x i32> %C, %D
+  %tmp3 = add <4 x i32> %tmp1, %tmp2
+  ret <4 x i32> %tmp3
+}
+
+define <2 x i32> @mulmlaD(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
+; CHECK: ********** MI Scheduling **********
+; CHECK-NEXT: mulmlaD:%bb.0
+; CHECK: MULv2i32
+; CHECK: Latency            : 5
+; CHECK: Successors
+; CHECK-NEXT: SU(5): Data Latency=1
+
+; CHECK: MLAv2i32
+; CHECK: Latency            : 5
+; CHECK: Successors
+; CHECK-NEXT: SU(6): Data Latency=5
+  %tmp0 = mul <2 x i32> %A, %B
+  %tmp1 = mul <2 x i32> %C, %D
+  %tmp2 = add <2 x i32> %tmp0, %tmp1
+  ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @mulmlaQ(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) {
+; CHECK: ********** MI Scheduling **********
+; CHECK-NEXT: mulmlaQ:%bb.0
+; CHECK: MULv4i32
+; CHECK: Latency            : 6
+; CHECK: Successors
+; CHECK-NEXT: SU(5): Data Latency=2
+
+; CHECK: MLAv4i32
+; CHECK: Latency            : 6
+; CHECK: Successors
+; CHECK-NEXT: SU(6): Data Latency=6
+  %tmp0 = mul <4 x i32> %A, %B
+  %tmp1 = mul <4 x i32> %C, %D
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}