Index: lib/Target/ARM64/ARM64InstrFormats.td
===================================================================
--- lib/Target/ARM64/ARM64InstrFormats.td
+++ lib/Target/ARM64/ARM64InstrFormats.td
@@ -1038,7 +1038,7 @@
                          SDPatternOperator node>
   : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
       [(set regtype:$Rd, (node regtype:$Rn))]>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
 
@@ -1077,7 +1077,7 @@
                       list<dag> pattern>
     : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
         asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-      Sched<[WriteI]> {
+      Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   bits<5> Rd;
   bits<5> Rn;
@@ -1151,11 +1151,11 @@
 multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
   def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
-           Sched<[WriteID32]> {
+           Sched<[WriteID32, ReadID, ReadID]> {
     let Inst{31} = 0;
   }
 
   def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
-           Sched<[WriteID64]> {
+           Sched<[WriteID64, ReadID, ReadID]> {
     let Inst{31} = 1;
   }
 }
@@ -1163,7 +1163,7 @@
 class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
                 SDPatternOperator OpNode = null_frag>
   : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
-    Sched<[WriteIS]> {
+    Sched<[WriteIS, ReadI]> {
   let Inst{11-10} = shift_type;
 }
 
@@ -1215,13 +1215,13 @@
 multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
   def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
       [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
-      Sched<[WriteIM32]> {
+      Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
     let Inst{31} = 0;
   }
 
   def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
       [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
-      Sched<[WriteIM64]> {
+      Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> {
     let Inst{31} = 1;
   }
 }
@@ -1231,7 +1231,7 @@
   : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
       [(set GPR64:$Rd,
             (AccNode GPR64:$Ra, (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
-    Sched<[WriteIM32]> {
+    Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
   let Inst{31} = 1;
 }
 
@@ -1239,7 +1239,7 @@
   : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
       asm, "\t$Rd, $Rn, $Rm", "",
       [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
-    Sched<[WriteIM64]> {
+    Sched<[WriteIM64, ReadIM, ReadIM]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1270,7 +1270,7 @@
   : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
       asm, "\t$Rd, $Rn, $Rm", "",
       [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
-    Sched<[WriteISReg]> {
+    Sched<[WriteISReg, ReadI, ReadISReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1357,7 +1357,7 @@
   : I<(outs regtype:$Rd),
       (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
       asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<16> imm;
   bits<6> shift;
@@ -1390,7 +1390,7 @@
   : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
       asm, "\t$Rd, $Rn, $imm", "",
       [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<14> imm;
@@ -1408,7 +1408,7 @@
                           SDPatternOperator OpNode>
   : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
            [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-    Sched<[WriteI]>;
+    Sched<[WriteI, ReadI, ReadI]>;
 
 class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
                      arith_shifted_reg shifted_regtype, string asm,
                      SDPatternOperator OpNode>
     : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
         asm, "\t$Rd, $Rn, $Rm", "",
         [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
-      Sched<[WriteISReg]> {
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
   // The operands are in order to match the 'addr' MI operands, so we
   // don't need an encoder method and by-name matching. Just use the default
   // in-order handling. Since we're using by-order, make sure the names
@@ -1445,7 +1445,7 @@
         (ins src1Regtype:$R2, src2Regtype:$R3),
         asm, "\t$R1, $R2, $R3", "",
         [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
-      Sched<[WriteIEReg]> {
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1470,7 +1470,7 @@
   : I<(outs dstRegtype:$Rd),
       (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
       asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
-    Sched<[WriteIEReg]> {
+    Sched<[WriteIEReg, ReadI, ReadIEReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1695,7 +1695,7 @@
                        RegisterClass regtype, Operand imm_type, string asm>
   : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
       asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
-    Sched<[WriteIS]> {
+    Sched<[WriteIS, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<6> immr;
@@ -1729,7 +1729,7 @@
   : I<(outs regtype:$Rd),
       (ins regtype:$src, regtype:$Rn, imm_type:$immr, imm_type:$imms),
       asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
-    Sched<[WriteIS]> {
+    Sched<[WriteIS, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<6> immr;
@@ -1767,7 +1767,7 @@
                       list<dag> pattern>
   : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
       asm, "\t$Rd, $Rn, $imm", "", pattern>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<13> imm;
@@ -1788,7 +1788,7 @@
                       list<dag> pattern>
   : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
       asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-    Sched<[WriteISReg]> {
+    Sched<[WriteISReg, ReadI, ReadISReg]> {
   // The operands are in order to match the 'addr' MI operands, so we
   // don't need an encoder method and by-name matching. Just use the default
   // in-order handling. Since we're using by-order, make sure the names
@@ -1846,7 +1846,7 @@
 class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
   : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
            [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-    Sched<[WriteI]>;
+    Sched<[WriteI, ReadI, ReadI]>;
 
 // Split from LogicalImm as not all instructions have both.
 multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
@@ -1912,7 +1912,7 @@
 class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
   : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
       asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
 
@@ -1944,7 +1944,7 @@
 class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
   : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
       asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
-    Sched<[WriteI]> {
+    Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
 
@@ -1981,7 +1981,7 @@
         asm, "\t$Rd, $Rn, $Rm, $cond", "",
         [(set regtype:$Rd, (ARM64csel regtype:$Rn, regtype:$Rm,
                                       (i32 imm:$cond), NZCV))]>,
-      Sched<[WriteI]> {
+      Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
 
   bits<5> Rd;
@@ -2014,7 +2014,7 @@
         [(set regtype:$Rd, (ARM64csel regtype:$Rn, (frag regtype:$Rm),
                                       (i32 imm:$cond), NZCV))]>,
-      Sched<[WriteI]> {
+      Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
 
   bits<5> Rd;
Index: lib/Target/ARM64/ARM64InstrInfo.h
===================================================================
--- lib/Target/ARM64/ARM64InstrInfo.h
+++ lib/Target/ARM64/ARM64InstrInfo.h
@@ -56,6 +56,9 @@
   unsigned isStoreToStackSlot(const MachineInstr *MI,
                               int &FrameIndex) const override;
 
+  /// \brief Is there a non-zero immediate?
+  bool hasNonZeroImm(const MachineInstr *MI) const;
+
   /// \brief Does this instruction set its full destination register to zero?
   bool isGPRZero(const MachineInstr *MI) const;
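
The new TargetInstrInfo hook above is reached from the scheduling model through
a SchedPredicate, which may embed an arbitrary C++ fragment. A minimal sketch
of that wiring (names prefixed with "Example" are made up for illustration; the
real definitions, RegShiftedPred and the A53 read variants, appear in
ARM64Schedule.td and ARM64SchedA53.td later in this patch):

  // Sketch only: pick between two forwarding behaviors for one SchedRead.
  // NoSchedPred (always true) is the standard fallback from TargetSchedule.td.
  def ExamplePred    : SchedPredicate<[{TII->hasNonZeroImm(MI)}]>;
  def ExampleReadVar : SchedReadVariant<[
                         SchedVar<ExamplePred, [A53ReadShifted]>,
                         SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
  def : SchedAlias<ReadISReg, ExampleReadVar>;
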
Index: lib/Target/ARM64/ARM64InstrInfo.cpp
===================================================================
--- lib/Target/ARM64/ARM64InstrInfo.cpp
+++ lib/Target/ARM64/ARM64InstrInfo.cpp
@@ -825,6 +825,21 @@
   return true;
 }
 
+/// Return true if this instruction has a non-zero immediate operand.
+bool ARM64InstrInfo::hasNonZeroImm(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    // Shifted- and extended-register forms encode their shift/extend amount
+    // as operand 3. Guard against forms that carry fewer operands.
+    if (MI->getNumOperands() > 3 && MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
 // Return true if this instruction simply sets its single destination register
 // to zero. This is equivalent to a register rename of the zero-register.
 bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const {
Index: lib/Target/ARM64/ARM64SchedA53.td
===================================================================
--- lib/Target/ARM64/ARM64SchedA53.td
+++ lib/Target/ARM64/ARM64SchedA53.td
@@ -20,7 +20,7 @@
   let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
   let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
   let MinLatency = 1 ;       // OperandCycles are interpreted as MinLatency.
-  let LoadLatency = 2;       // Optimistic load latency assuming bypass.
+  let LoadLatency = 3;       // Optimistic load latency assuming bypass.
                              // This is overriden by OperandCycles if the
                              // Itineraries are queried instead.
   let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
@@ -32,7 +32,7 @@
 //===----------------------------------------------------------------------===//
 // Define each kind of processor resource and number available.
 
-// Modeling each pipeline as a ProcResource using the BufferSize = 0 since 
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
 // Cortex-A53 is in-order.
 def A53UnitALU    : ProcResource<2> { let BufferSize = 0; }  // Int ALU
 def A53UnitMAC    : ProcResource<1> { let BufferSize = 0; }  // Int MAC
@@ -50,16 +50,16 @@
 
 let SchedModel = CortexA53Model in {
 
-// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
-//       model forwarding logic. Once forwarding is properly modelled, then
-//       they'll be corrected.
-def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteAdr, [A53UnitALU]> { let Latency = 1; }
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+//       forward a cycle earlier and then two cycles earlier in the case of a
+//       shift-only instruction. These latencies will be incorrect when the
+//       result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
 
 // MAC
 def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
@@ -73,14 +73,41 @@
 def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
 def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
 def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVLD
+//               below, choosing the median of 3 which makes the latency 6.
+//               May model this more carefully in the future. The remaining
+//               A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+                                          let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+                                                  let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+                                                  let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+//                     accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
 
 // Store
 def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
 def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
 def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
 def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+                                          let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
 
 // Branch
 def : WriteRes<WriteBr, [A53UnitB]>;
@@ -101,29 +101,154 @@
 def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
 def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
                                             let ResourceCycles = [29]; }
-def A53WriteFDiv : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
-                                                   let ResourceCycles = [29]; }
-def A53WriteFSqrt : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
-                                                    let ResourceCycles = [28]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+                                                     let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+                                                     let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+                                                      let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+                                                      let ResourceCycles = [28]; }
 
 //===----------------------------------------------------------------------===//
 // Subtarget-specific SchedRead types.
 
-// While there is no forwarding information defined for these SchedRead types,
-// they are still used by some instruction via a SchedRW list and so these zero
-// SchedReadAdvances are required.
-
+// No forwarding for these reads.
 def : ReadAdvance<ReadExtrHi, 0>;
 def : ReadAdvance<ReadAdrBase, 0>;
 def : ReadAdvance<ReadVLD, 0>;
 
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycles later.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+                             WriteISReg, WriteIEReg,WriteIS,
+                             WriteID32,WriteID64,
+                             WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                          WriteISReg, WriteIEReg,WriteIS,
+                                          WriteID32,WriteID64,
+                                          WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                             WriteISReg, WriteIEReg,WriteIS,
+                                             WriteID32,WriteID64,
+                                             WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+                     SchedVar<RegShiftedPred, [A53ReadShifted]>,
+                     SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+                     SchedVar<RegShiftedPred, [A53ReadShifted]>,
+                     SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteID32,WriteID64]>;
+
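+// Worked example of the numbers above: an ADDXrr whose input is produced by
+// another ALU instruction sees an effective latency of 3 - 2 = 1 cycle
+// (WriteI latency 3, ReadI advance 2). If the consumer shifts that operand
+// instead, e.g. ADDXrs with a non-zero shift, RegShiftedPred selects
+// A53ReadShifted and the effective latency becomes 3 - 1 = 2 cycles.
+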
 //===----------------------------------------------------------------------===//
 // Subtarget-specific InstRWs.
 
+//---
+// Miscellaneous
+//---
 def : InstRW<[WriteI], (instrs COPY)>;
-def : InstRW<[WriteLD], (instregex "LD[1-4]")>;
-def : InstRW<[WriteST], (instregex "ST[1-4]")>;
-def : InstRW<[A53WriteFDiv], (instregex "^FDIV")>;
-def : InstRW<[A53WriteFSqrt], (instregex ".*SQRT.*")>;
+
+//---
+// Vector Mul with Accumulate
+//---
+//def : InstRW<[WriteIM32, A53ReadIMA], (instregex "^M(ADD|SUB)W.*")>;
+//def : InstRW<[WriteIM64, A53ReadIMA], (instregex "^M(ADD|SUB)X.*")>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD1, A53WriteVLD1], (instregex "LDN?PS.*$")>;
+def : InstRW<[A53WriteVLD2, A53WriteVLD2], (instregex "LDN?PD.*$")>;
+def : InstRW<[A53WriteVLD4, A53WriteVLD4], (instregex "LDN?PQ.*$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "STN?P(S|D).*$")>;
+def : InstRW<[A53WriteVST2], (instregex "STN?PQ.*$")>;
+
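+// For example, LD1Fourv8b maps to A53WriteVLD4 above: it occupies the single
+// load/store pipe for four consecutive issue cycles (ResourceCycles = [4]) and
+// defines its results 7 cycles later, i.e. the 4 issue cycles plus the base
+// 3-cycle load latency.
+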
"^FN?M(ADD|SUB).*")>; +def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; } Index: lib/Target/ARM64/ARM64SchedCyclone.td =================================================================== --- lib/Target/ARM64/ARM64SchedCyclone.td +++ lib/Target/ARM64/ARM64SchedCyclone.td @@ -851,4 +851,15 @@ def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; +//--- +// Unused SchedRead types +//--- + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + } // SchedModel = CycloneModel Index: lib/Target/ARM64/ARM64Schedule.td =================================================================== --- lib/Target/ARM64/ARM64Schedule.td +++ lib/Target/ARM64/ARM64Schedule.td @@ -25,13 +25,19 @@ def WriteI : SchedWrite; // ALU def WriteISReg : SchedWrite; // ALU of Shifted-Reg def WriteIEReg : SchedWrite; // ALU of Extended-Reg +def ReadI : SchedRead; // ALU +def ReadISReg : SchedRead; // ALU of Shifted-Reg +def ReadIEReg : SchedRead; // ALU of Extended-Reg def WriteExtr : SchedWrite; // EXTR shifts a reg pair def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair def WriteIS : SchedWrite; // Shift/Scale def WriteID32 : SchedWrite; // 32-bit Divide def WriteID64 : SchedWrite; // 64-bit Divide +def ReadID : SchedRead; // 32/64-bit Divide def WriteIM32 : SchedWrite; // 32-bit Multiply def WriteIM64 : SchedWrite; // 64-bit Multiply +def ReadIM : SchedRead; // 32/64-bit Multiply +def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate def WriteBr : SchedWrite; // Branch def WriteBrReg : SchedWrite; // Indirect Branch @@ -44,6 +50,9 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. +// Predicate for determining when a shiftable register is shifted. +def RegShiftedPred : SchedPredicate<[{TII->hasNonZeroImm(MI)}]>; + // ScaledIdxPred is true if a WriteLDIdx operand will be // scaled. Subtargets can use this to dynamically select resources and // latency for WriteLDIdx and ReadAdrBase. 
Index: test/CodeGen/ARM64/misched-basic-A53.ll
===================================================================
--- test/CodeGen/ARM64/misched-basic-A53.ll
+++ test/CodeGen/ARM64/misched-basic-A53.ll
@@ -8,9 +8,7 @@
 ; CHECK: ********** MI Scheduling **********
 ; CHECK: main
 ; CHECK: *** Final schedule for BB#2 ***
-; CHECK: SU(13)
 ; CHECK: MADDWrrr
-; CHECK: SU(4)
 ; CHECK: ADDWri
 ; CHECK: ********** INTERVALS **********
 @main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
Index: test/CodeGen/ARM64/misched-forwarding-A53.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM64/misched-forwarding-A53.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; For Cortex-A53, shiftable operands that are not actually shifted
+; are not needed for an additional two cycles.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: shiftable
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: ADDXrr %vreg0, %vreg2
+; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: ********** INTERVALS **********
+define i64 @shiftable(i64 %A, i64 %B) {
+  %tmp0 = sub i64 %B, 20
+  %tmp1 = shl i64 %tmp0, 5
+  %tmp2 = add i64 %A, %tmp1
+  %tmp3 = add i64 %A, %tmp0
+  %tmp4 = mul i64 %tmp2, %tmp3
+
+  ret i64 %tmp4
+}