Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1753,6 +1753,25 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+// isIntImmediate - This method tests to see if the node is a constant
+// operand. If so, Imm will receive the value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                                const APInt &Demanded,
                                TargetLowering::TargetLoweringOpt &TLO,
@@ -16573,6 +16592,40 @@
   return SDValue();
 }
 
+static SDValue performAddCombineForShiftedOperands(SDNode *N,
+                                                   SelectionDAG &DAG) {
+  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
+  // commutative.
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Bail out when the value type is not one of {i32, i64}, since AArch64 ADD
+  // with a shifted register is only available for i32 and i64.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  uint64_t LHSImm = 0, RHSImm = 0;
+  // If both operands are shifted by an immediate and the shift amount is not
+  // greater than 4 for one operand, swap LHS and RHS to put the operand with
+  // the smaller shift amount on the RHS.
+  //
+  // On many AArch64 processors (Cortex-A78, Neoverse N1/N2/V1, etc.), ADD
+  // with LSL shift (shift <= 4) has smaller latency and larger throughput
+  // than ADD with LSL (shift > 4). For the remaining processors, the swap is
+  // a no-op for both performance and correctness.
+  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
+      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
+      RHSImm > 4 && LHS.hasOneUse())
+    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
+
+  return SDValue();
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -16587,6 +16640,8 @@
     return Val;
   if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
     return Val;
+  if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
+    return Val;
   return performAddSubLongCombine(N, DCI, DAG);
 }
 
Index: llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
===================================================================
--- llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -292,8 +292,8 @@
 define i64 @add_swap_rhs_lhs_i64(i64 %0, i64 %1) {
 ; CHECK-LABEL: add_swap_rhs_lhs_i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsl x8, x1, #3
-; CHECK-NEXT: add x0, x8, x0, lsl #8
+; CHECK-NEXT: lsl x8, x0, #8
+; CHECK-NEXT: add x0, x8, x1, lsl #3
 ; CHECK-NEXT: ret
   %3 = shl i64 %0, 8
   %4 = shl i64 %1, 3
@@ -318,8 +318,8 @@
 define i32 @add_swap_rhs_lhs_i32(i32 %0, i32 %1) {
 ; CHECK-LABEL: add_swap_rhs_lhs_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w1, #3
-; CHECK-NEXT: add w0, w8, w0, lsl #8
+; CHECK-NEXT: lsl w8, w0, #8
+; CHECK-NEXT: add w0, w8, w1, lsl #3
 ; CHECK-NEXT: ret
   %3 = shl i32 %0, 8
   %4 = shl i32 %1, 3
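A minimal sketch of the transformation, using the i64 test above (the add
instruction itself falls outside the hunk context, so the IR operand order
shown here is an assumption, chosen so the combine fires): AArch64's ADD
(shifted register) form applies the shift to the second source register, so
the swap controls which shift gets folded.

  %3 = shl i64 %0, 8   ; shift amount > 4
  %4 = shl i64 %1, 3   ; shift amount <= 4
  %5 = add i64 %4, %3  ; small-shift operand on the LHS triggers the swap

After the combine rewrites the add as ADD(%3, %4), instruction selection
folds the cheap shift into the ADD while the expensive shift stays a
standalone LSL, matching the new CHECK lines:

  lsl x8, x0, #8          // shift > 4: kept as a separate LSL
  add x0, x8, x1, lsl #3  // shift <= 4: folded into ADD (shifted register)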