Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -38,7 +38,10 @@
   MULSUBX_OP1,
   MULSUBX_OP2,
   MULADDXI_OP1,
-  MULSUBXI_OP1
+  MULSUBXI_OP1,
+
+  KRYO_LSL_ADDWS,
+  KRYO_LSL_ADDXS,
 };
 
 } // end namespace llvm
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2656,6 +2656,92 @@
   }
 }
 
+static bool isCannonicalizeCandidate(MachineBasicBlock &MBB, MachineOperand &MO,
+                                     unsigned LSLOpc, unsigned &ShiftVal) {
+  MachineInstr *MI = nullptr;
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // We need a virtual register definition.
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    MI = MRI.getUniqueVRegDef(MO.getReg());
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != LSLOpc)
+    return false;
+  // The LSL must only be used by the instruction we canonicalize with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  const MachineOperand &Op2 = MI->getOperand(2);
+  const MachineOperand &Op3 = MI->getOperand(3);
+
+  if (!Op2.isImm() || !Op3.isImm())
+    return false;
+
+  int64_t immr = Op2.getImm();
+  int64_t imms = Op3.getImm();
+  if (LSLOpc == AArch64::UBFMWri && imms != 0x1F && (imms + 1) == immr)
+    ShiftVal = 31 - imms;
+  else if (LSLOpc == AArch64::UBFMXri && imms != 0x3F && (imms + 1) == immr)
+    ShiftVal = 63 - imms;
+  else
+    return false;
+
+  return true;
+}
+
+// On Kryo, if a commutative instruction has an LSL feeding both operands and
+// the LSL can be folded into the instruction's shifted-register operand (e.g.,
+// add x0, x1, x2, lsl #3), then we should canonicalize the operands so that
+// the operand with the smaller shift amount is the one that gets folded.
+//
+// For example, rather than
+//
+//   lsl x1, x1, #1
+//   add x0, x1, x2, lsl #4
+//
+// we should prefer
+//
+//   lsl x2, x2, #4
+//   add x0, x2, x1, lsl #1
+//
+// as this saves a cycle on the add instruction.
+static bool
+getCannonicalizePatterns(MachineInstr &Root,
+                         SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock &MBB = *Root.getParent();
+  unsigned LSLOpc;
+  unsigned ShiftVal1, ShiftVal2;
+  MachineCombinerPattern Pattern;
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADDWrs:
+    LSLOpc = AArch64::UBFMWri;
+    Pattern = MachineCombinerPattern::KRYO_LSL_ADDWS;
+    break;
+  case AArch64::ADDXrs:
+    LSLOpc = AArch64::UBFMXri;
+    Pattern = MachineCombinerPattern::KRYO_LSL_ADDXS;
+    break;
+  }
+
+  if (!isCannonicalizeCandidate(MBB, Root.getOperand(1), LSLOpc, ShiftVal1))
+    return false;
+
+  if (!Root.getOperand(3).isImm())
+    return false;
+  unsigned val = Root.getOperand(3).getImm();
+  if (!val || AArch64_AM::getShiftType(val) != AArch64_AM::LSL)
+    return false;
+  ShiftVal2 = AArch64_AM::getShiftValue(val);
+
+  if (ShiftVal1 >= ShiftVal2)
+    return false;
+
+  Patterns.push_back(Pattern);
+  return true;
+}
+
 /// Find instructions that can be turned into madd.
 static bool getMaddPatterns(MachineInstr &Root,
                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
@@ -2771,12 +2857,94 @@
 bool AArch64InstrInfo::getMachineCombinerPatterns(
     MachineInstr &Root,
     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  if (Subtarget.isKryo() && getCannonicalizePatterns(Root, Patterns))
+    return true;
+
   if (getMaddPatterns(Root, Patterns))
     return true;
 
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
 }
 
+// Canonicalize from
+//   B = LSL A, #shift1
+//   D = ADD B, C, lsl #shift2
+//
+// to
+//   B = LSL C, #shift2
+//   D = ADD B, A, lsl #shift1
+static void CannonicalizeOperands(MachineFunction &MF, MachineRegisterInfo &MRI,
+                                  const TargetInstrInfo *TII,
+                                  MachineInstr &Root,
+                                  MachineCombinerPattern Pattern,
+                                  SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                  SmallVectorImpl<MachineInstr *> &DelInstrs) {
+  MachineInstr *LSL = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  const TargetRegisterClass *RC;
+
+  unsigned ShiftVal1, ShiftVal2;
+  ShiftVal2 = AArch64_AM::getShiftValue(Root.getOperand(3).getImm());
+  unsigned ImmR2;
+  int64_t ImmS1 = LSL->getOperand(3).getImm();
+  switch (Pattern) {
+  case MachineCombinerPattern::KRYO_LSL_ADDWS:
+    ShiftVal1 = 31 - ImmS1;
+    ImmR2 = 32 - ShiftVal2;
+    RC = &AArch64::GPR32RegClass;
+    break;
+  case MachineCombinerPattern::KRYO_LSL_ADDXS:
+    ShiftVal1 = 63 - ImmS1;
+    ImmR2 = 64 - ShiftVal2;
+    RC = &AArch64::GPR64RegClass;
+    break;
+  default:
+    llvm_unreachable("unexpected MachineCombinerPattern");
+  }
+  unsigned ImmS2 = ImmR2 - 1;
+
+  MachineOperand &OpA = LSL->getOperand(1);
+  MachineOperand &OpB = LSL->getOperand(0);
+  MachineOperand &OpC = Root.getOperand(2);
+  MachineOperand &OpD = Root.getOperand(0);
+
+  unsigned RegA = OpA.getReg();
+  unsigned RegB = OpB.getReg();
+  unsigned RegC = OpC.getReg();
+  unsigned RegD = OpD.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(RegA))
+    MRI.constrainRegClass(RegA, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegB))
+    MRI.constrainRegClass(RegB, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegC))
+    MRI.constrainRegClass(RegC, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegD))
+    MRI.constrainRegClass(RegD, RC);
+
+  unsigned AddOpc = Root.getOpcode();
+  unsigned LSLOpc = LSL->getOpcode();
+  bool KillA = OpA.isKill();
+  bool KillB = OpB.isKill();
+  bool KillC = OpC.isKill();
+
+  // Create new instructions for insertion.
+  MachineInstrBuilder MIB1 =
+      BuildMI(MF, LSL->getDebugLoc(), TII->get(LSLOpc), RegB)
+          .addReg(RegC, getKillRegState(KillC))
+          .addImm(ImmR2)
+          .addImm(ImmS2);
+  MachineInstrBuilder MIB2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(AddOpc), RegD)
+          .addReg(RegB, getKillRegState(KillB))
+          .addReg(RegA, getKillRegState(KillA))
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftVal1));
+
+  InsInstrs.push_back(MIB1);
+  InsInstrs.push_back(MIB2);
+  DelInstrs.push_back(LSL);
+  DelInstrs.push_back(&Root);
+}
+
 /// genMadd - Generate madd instruction and combine mul and add.
 /// Example:
 ///  MUL I=A,B,0
@@ -2894,6 +3062,10 @@
     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                 DelInstrs, InstrIdxForVirtReg);
     return;
+  case MachineCombinerPattern::KRYO_LSL_ADDWS:
+  case MachineCombinerPattern::KRYO_LSL_ADDXS:
+    CannonicalizeOperands(MF, MRI, TII, Root, Pattern, InsInstrs, DelInstrs);
+    return;
   case MachineCombinerPattern::MULADDW_OP1:
   case MachineCombinerPattern::MULADDX_OP1:
     // MUL I=A,B,0
Index: test/CodeGen/AArch64/kryo-lsl-addrs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/kryo-lsl-addrs.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s
+
+; Verify that the shift amount in the add instruction is always the smaller
+; one.
+
+define i32 @lsl_add1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add1:
+; CHECK: lsl w8, w0, #3
+; CHECK-NEXT: add w0, w8, w1, lsl #2
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 2
+  %add = add i32 %shl1, %shl
+  ret i32 %add
+}
+
+define i32 @lsl_add2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add2:
+; CHECK: lsl w8, w1, #3
+; CHECK-NEXT: add w0, w8, w0, lsl #2
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i32 %a, 2
+  %shl1 = shl i32 %b, 3
+  %add = add i32 %shl1, %shl
+  ret i32 %add
+}
+
+define i64 @lsl_add3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add3:
+; CHECK: lsl x8, x0, #3
+; CHECK-NEXT: add x0, x8, x1, lsl #2
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 2
+  %add = add i64 %shl1, %shl
+  ret i64 %add
+}
+
+define i64 @lsl_add4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add4:
+; CHECK: lsl x8, x1, #3
+; CHECK-NEXT: add x0, x8, x0, lsl #2
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 2
+  %shl1 = shl i64 %b, 3
+  %add = add i64 %shl1, %shl
+  ret i64 %add
+}
+
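
Both the candidate check and the rewrite lean on the AArch64 encoding rule that an immediate LSL is an alias of UBFM with immr = (regsize - shift) % regsize and imms = regsize - 1 - shift, so for a non-zero shift immr == imms + 1 and the shift amount can be recovered as (regsize - 1) - imms. The following standalone sketch (illustrative only, not part of the patch) exercises that arithmetic for both register widths:

// Sketch: check the LSL <-> UBFM immediate relationship assumed by
// isCannonicalizeCandidate and CannonicalizeOperands. For a non-zero shift,
// "lsl #shift" encodes as UBFM with immr = regsize - shift and
// imms = regsize - 1 - shift.
#include <cassert>

int main() {
  const unsigned RegSizes[] = {32, 64};
  for (unsigned RegSize : RegSizes) {
    for (unsigned Shift = 1; Shift < RegSize; ++Shift) {
      unsigned ImmR = (RegSize - Shift) % RegSize; // UBFM rotate amount
      unsigned ImmS = RegSize - 1 - Shift;         // UBFM leftmost source bit
      assert(ImmR == ImmS + 1);            // the (imms + 1) == immr check
      assert(RegSize - 1 - ImmS == Shift); // ShiftVal = 31 - imms / 63 - imms
    }
  }
  return 0;
}

This also explains why imms == 0x1F (or 0x3F) is rejected in isCannonicalizeCandidate: a shift of zero encodes with immr = 0 rather than imms + 1, and folding a zero shift gains nothing.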