diff --git a/llvm/lib/Target/ARM/ARMScheduleA57.td b/llvm/lib/Target/ARM/ARMScheduleA57.td
--- a/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -183,11 +183,6 @@
 // TODO: according to the doc, conditional uses I0/I1, unconditional uses M
 // Why more complex instruction uses more simple pipeline?
 // May be an error in doc.
-def A57WriteALUsi : SchedWriteVariant<[
-  // lsl #2, lsl #1, or lsr #1.
-  SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>,
-  SchedVar<NoSchedPred,      [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
-]>;
 def A57WriteALUsr : SchedWriteVariant<[
   SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>,
   SchedVar<NoSchedPred,      [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
@@ -200,7 +195,7 @@
   SchedVar<IsPredicatedPred, [ReadDefault]>,
   SchedVar<NoSchedPred,      [ReadDefault]>
 ]>;
-def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
+def : SchedAlias<WriteALUsi,  CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>>;
 def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
 def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
 def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir b/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir
@@ -0,0 +1,34 @@
+# RUN: llc -mcpu=cortex-a57 -mtriple=thumb -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s 2>&1 | FileCheck %s
+
+# CHECK-LABEL: ********** MI Scheduling **********
+# CHECK:       %[[RES:[0-9]+]]:rgpr = t2MLA
+# CHECK-NEXT:  # preds left
+# CHECK-NEXT:  # succs left
+# CHECK-NEXT:  # rdefs left
+# CHECK-NEXT:  Latency : 3
+# CHECK-NEXT:  Depth
+# CHECK-NEXT:  Height
+# CHECK-NEXT:  Predecessors:
+# CHECK-NEXT:    SU({{.*}}): Data Latency=1 Reg=
+# CHECK-NEXT:    SU({{.*}}): Out  Latency=
+# CHECK-NEXT:    SU({{.*}}): Data Latency=1 Reg=
+# CHECK-NEXT:  Successors:
+# CHECK-NEXT:    SU([[SMLA_SU:[0-9]+]]): Data Latency=1 Reg=%[[RES]]
+# CHECK-NEXT:  Pressure Diff
+# CHECK-NEXT:  Single Issue : false;
+# CHECK-NEXT:  SU([[SMLA_SU]]): {{.*}} = t2SMLAL %{{[0-9]+}}:rgpr, %{{[0-9]+}}:rgpr, %{{[0-9]+}}:rgpr(tied-def 0), %[[RES]]:rgpr(tied-def 1), 14, $noreg
+
+name:            test_smlal_forwarding
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r1, $r3, $r4, $r5, $r6
+    %1:rgpr = COPY $r1
+    %3:rgpr = COPY $r3
+    %4:rgpr = COPY $r4
+    %5:rgpr = COPY $r5
+    %6:rgpr = COPY $r6
+    %3:rgpr = t2MLA %4:rgpr, %1:rgpr, %4:rgpr, 14, $noreg
+    %6:rgpr, %5:rgpr = t2SMLAL %5:rgpr, %6:rgpr, %4:rgpr, %3:rgpr, 14, $noreg
+    $r0 = COPY %6:rgpr
+    BX_RET 14, $noreg, implicit $r0
diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h
--- a/llvm/utils/TableGen/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/CodeGenSchedule.h
@@ -443,6 +443,7 @@
   InstClassMapTy InstrClassMap;
 
   std::vector<STIPredicateFunction> STIPredicates;
+  std::vector<unsigned> getAllProcIndices() const;
 
 public:
   CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT);
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -1338,8 +1338,7 @@
   PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {}
 
   bool substituteVariantOperand(const SmallVectorImpl<unsigned> &RWSeq,
-                                bool IsRead, bool IsForAnyCPU,
-                                unsigned StartIdx);
+                                bool IsRead, unsigned StartIdx);
 
   bool substituteVariants(const PredTransition &Trans);
 
@@ -1413,29 +1412,6 @@
   return false;
 }
 
-static bool hasAliasedVariants(const CodeGenSchedRW &RW,
-                               CodeGenSchedModels &SchedModels) {
-  if (RW.HasVariants)
-    return true;
-
-  for (Record *Alias : RW.Aliases) {
-    const CodeGenSchedRW &AliasRW =
-      SchedModels.getSchedRW(Alias->getValueAsDef("AliasRW"));
-    if (AliasRW.HasVariants)
-      return true;
-    if (AliasRW.IsSequence) {
-      IdxVec ExpandedRWs;
-      SchedModels.expandRWSequence(AliasRW.Index, ExpandedRWs, AliasRW.IsRead);
-      for (unsigned SI : ExpandedRWs) {
-        if (hasAliasedVariants(SchedModels.getSchedRW(SI, AliasRW.IsRead),
-                               SchedModels))
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
 static std::vector<Record *> getAllPredicates(ArrayRef<TransVariant> Variants,
                                               ArrayRef<unsigned> ProcIndices) {
   std::vector<Record *> Preds;
@@ -1613,21 +1589,7 @@
 // starts. RWSeq must be applied to all transitions between StartIdx and the end
 // of TransVec.
 bool PredTransitions::substituteVariantOperand(
-    const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, bool IsForAnyCPU,
-    unsigned StartIdx) {
-
-  auto CollectAndAddVariants = [&](unsigned TransIdx,
-                                   const CodeGenSchedRW &SchedRW) {
-    // Distribute this partial PredTransition across intersecting variants.
-    // This will push a copies of TransVec[TransIdx] on the back of TransVec.
-    std::vector<TransVariant> IntersectingVariants;
-    getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants);
-    // Now expand each variant on top of its copy of the transition.
-    for (const TransVariant &IV : IntersectingVariants)
-      pushVariant(IV, IsRead);
-    return !IntersectingVariants.empty();
-  };
-
+    const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, unsigned StartIdx) {
   bool Subst = false;
   // Visit each original RW within the current sequence.
   for (SmallVectorImpl<unsigned>::const_iterator
@@ -1636,35 +1598,24 @@
     // Push this RW on all partial PredTransitions or distribute variants.
     // New PredTransitions may be pushed within this loop which should not be
     // revisited (TransEnd must be loop invariant).
-    bool HasAliases = false, WasPushed = false;
     for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size();
          TransIdx != TransEnd; ++TransIdx) {
-      // In the common case, push RW onto the current operand's sequence.
-      if (!hasAliasedVariants(SchedRW, SchedModels)) {
+      // Distribute this partial PredTransition across intersecting variants.
+      // This will push a copies of TransVec[TransIdx] on the back of TransVec.
+      std::vector<TransVariant> IntersectingVariants;
+      getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants);
+      // Now expand each variant on top of its copy of the transition.
+      for (const TransVariant &IV : IntersectingVariants)
+        pushVariant(IV, IsRead);
+      if (IntersectingVariants.empty()) {
         if (IsRead)
           TransVec[TransIdx].ReadSequences.back().push_back(*RWI);
         else
           TransVec[TransIdx].WriteSequences.back().push_back(*RWI);
         continue;
+      } else {
+        Subst = true;
       }
-      HasAliases = true;
-      WasPushed |= CollectAndAddVariants(TransIdx, SchedRW);
-      Subst |= WasPushed;
-    }
-    if (IsRead && IsForAnyCPU && HasAliases && !WasPushed) {
-      // If we're here this means that in some sched class:
-      // a) We have read variant for CPU A
-      // b) We have write variant for CPU B
-      // b) We don't have write variant for CPU A
-      // d) We must expand all read/write variants (IsForAnyCPU is true)
-      // e) We couldn't expand SchedRW because TransVec doesn't have
-      //    any transition with compatible CPU ID.
-      // In such case we create new empty transition with zero (AnyCPU)
-      // index.
-      TransVec.reserve(TransVec.size() + 1);
-      TransVec.emplace_back(TransVec[StartIdx].PredTerm);
-      TransVec.back().ReadSequences.emplace_back();
-      Subst |= CollectAndAddVariants(TransVec.size() - 1, SchedRW);
     }
   }
   return Subst;
@@ -1683,7 +1634,7 @@
   bool Subst = false;
   TransVec.emplace_back(Trans.PredTerm, Trans.ProcIndices);
 
-  bool IsForAnyCPU = llvm::count(Trans.ProcIndices, 0);
+  assert(!llvm::count(Trans.ProcIndices, 0));
   // Visit each original write sequence.
   for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
          WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end();
@@ -1693,8 +1644,7 @@
            TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
       I->WriteSequences.emplace_back();
     }
-    Subst |=
-        substituteVariantOperand(*WSI, /*IsRead=*/false, IsForAnyCPU, StartIdx);
+    Subst |= substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx);
   }
   // Visit each original read sequence.
   for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
@@ -1705,8 +1655,7 @@
            TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
       I->ReadSequences.emplace_back();
     }
-    Subst |=
-        substituteVariantOperand(*RSI, /*IsRead=*/true, IsForAnyCPU, StartIdx);
+    Subst |= substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx);
   }
   return Subst;
 }
@@ -1745,6 +1694,10 @@
   // requires creating a new SchedClass.
   for (ArrayRef<PredTransition>::iterator
          I = LastTransitions.begin(), E = LastTransitions.end(); I != E; ++I) {
+    // Variant expansion (substituteVariants) may create unconditional
+    // transitions. We don't need to build sched classes for them.
+    if (I->PredTerm.empty())
+      continue;
     IdxVec OperWritesVariant, OperReadsVariant;
     addSequences(SchedModels, I->WriteSequences, OperWritesVariant, false);
     addSequences(SchedModels, I->ReadSequences, OperReadsVariant, true);
@@ -1777,6 +1730,26 @@
   }
 }
 
+std::vector<unsigned> CodeGenSchedModels::getAllProcIndices() const {
+  std::vector<unsigned> ProcIdVec;
+  for (const auto &PM : ProcModelMap)
+    if (PM.second != 0)
+      ProcIdVec.push_back(PM.second);
+  return ProcIdVec;
+}
+
+static std::vector<PredTransition>
+makePerProcessorTransitions(const PredTransition &Trans,
+                            ArrayRef<unsigned> ProcIndices) {
+  std::vector<PredTransition> PerCpuTransVec;
+  for (unsigned ProcId : ProcIndices) {
+    assert(ProcId != 0);
+    PerCpuTransVec.push_back(Trans);
+    PerCpuTransVec.back().ProcIndices.assign(1, ProcId);
+  }
+  return PerCpuTransVec;
+}
+
 // Create new SchedClasses for the given ReadWrite list. If any of the
 // ReadWrites refers to a SchedVariant, create a new SchedClass for each variant
 // of the ReadWrite list, following Aliases if necessary.
@@ -1812,6 +1785,10 @@
   }
   LLVM_DEBUG(dbgs() << '\n');
 
+  LastTransitions = makePerProcessorTransitions(
+      LastTransitions[0], llvm::count(ProcIndices, 0)
+                              ? ArrayRef<unsigned>(getAllProcIndices())
+                              : ProcIndices);
   // Collect all PredTransitions for individual operands.
   // Iterate until no variant writes remain.
   bool SubstitutedAny;
@@ -1823,9 +1800,6 @@
     LLVM_DEBUG(Transitions.dump());
     LastTransitions.swap(Transitions.TransVec);
   } while (SubstitutedAny);
-  // If the first transition has no variants, nothing to do.
-  if (LastTransitions[0].PredTerm.empty())
-    return;
 
   // WARNING: We are about to mutate the SchedClasses vector. Do not refer to
   // OperWrites, OperReads, or ProcIndices after calling inferFromTransitions.