Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1460,6 +1460,21 @@ return IsLegal; } +// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on +// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags +// can be set differently by this operation. It comes down to whether +// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then +// everything is fine. If not then the optimization is wrong. Thus general +// comparisons are only valid if op2 != 0. +// +// So, finally, the only LLVM-native comparisons that don't mention C and V +// are SETEQ and SETNE. They're the only ones we can safely use CMN for in +// the absence of information about op2. +static bool isCMN(SDValue Op, ISD::CondCode CC) { + return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && + (CC == ISD::SETEQ || CC == ISD::SETNE); +} + static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); @@ -1482,18 +1497,8 @@ // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on - // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags - // can be set differently by this operation. It comes down to whether - // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then - // everything is fine. If not then the optimization is wrong. Thus general - // comparisons are only valid if op2 != 0. - - // So, finally, the only LLVM-native comparisons that don't mention C and V - // are SETEQ and SETNE. 
They're the only ones we can safely use CMN for in - the absence of information about op2. + if (isCMN(RHS, CC)) { + // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && @@ -1765,6 +1770,42 @@ /// @} +/// Returns how profitable it is to fold a comparison's operand's shift and/or +/// extension operations. +static unsigned getCmpOperandFoldingProfit(SDValue Op) { + auto isSupportedExtend = [&](SDValue V) { + if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) + return true; + + if (V.getOpcode() == ISD::AND) + if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { + uint64_t Mask = MaskCst->getZExtValue(); + return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); + } + + return false; + }; + + if (!Op.hasOneUse()) + return 0; + + if (isSupportedExtend(Op)) + return 1; + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) + if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + uint64_t Shift = ShiftCst->getZExtValue(); + if (isSupportedExtend(Op.getOperand(0))) + return (Shift <= 4) ? 2 : 1; + EVT VT = Op.getValueType(); + if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) + return 1; + } + + return 0; +} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl) { @@ -1822,6 +1863,27 @@ } } } + + // Comparisons are canonicalized so that the RHS operand is simpler than the + // LHS one, the extreme case being when RHS is an immediate. However, AArch64 + // can fold some shift+extend operations on the RHS operand, so swap the + // operands if that can be done. + // + // For example: + // lsl w13, w11, #1 + // cmp w13, w12 + // can be turned into: + // cmp w12, w11, lsl #1 + if (!isa<ConstantSDNode>(RHS) || + !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { + SDValue TheLHS = isCMN(LHS, CC) ? 
LHS.getOperand(1) : LHS; + + if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { + std::swap(LHS, RHS); + CC = ISD::getSetCCSwappedOperands(CC); + } + } + SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { Index: llvm/trunk/test/CodeGen/AArch64/and-mask-removal.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/and-mask-removal.ll +++ llvm/trunk/test/CodeGen/AArch64/and-mask-removal.ll @@ -179,7 +179,9 @@ ret_true: ret i1 true ; CHECK-LABEL: test16_2 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #16882 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } @@ -207,7 +209,9 @@ ret_true: ret i1 true ; CHECK-LABEL: test16_4 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #29985 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } @@ -249,7 +253,9 @@ ret_true: ret i1 true ; CHECK-LABEL: test16_7 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #9272 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } Index: llvm/trunk/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll +++ llvm/trunk/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll @@ -35,8 +35,7 @@ define i1 @shifts_necmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 16 ; 32-16 @@ -48,8 +47,7 @@ define i1 @shifts_necmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 24 
; 32-8 @@ -61,8 +59,7 @@ define i1 @shifts_necmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 32 ; 64-32 @@ -74,8 +71,7 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 48 ; 64-16 @@ -87,8 +83,7 @@ define i1 @shifts_necmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 56 ; 64-8 @@ -117,8 +112,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, -32768 ; ~0U << (16-1) @@ -129,8 +123,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, -128 ; ~0U << (8-1) @@ -141,8 +134,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1) @@ -153,8 +145,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret 
%tmp0 = add i64 %x, -32768 ; ~0U << (16-1) @@ -165,8 +156,7 @@ define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, -128 ; ~0U << (8-1) @@ -208,8 +198,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, 32768 ; 1U << (16-1) @@ -220,8 +209,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, 128 ; 1U << (8-1) @@ -232,8 +220,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1) @@ -244,8 +231,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 32768 ; 1U << (16-1) @@ -256,8 +242,7 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 128 ; 1U << (8-1) Index: llvm/trunk/test/CodeGen/AArch64/sat-add.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/sat-add.ll +++ 
llvm/trunk/test/CodeGen/AArch64/sat-add.ll @@ -52,11 +52,10 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w9, #65493 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: cmp w8, w0, uxth ; CHECK-NEXT: mov w8, #-43 -; CHECK-NEXT: csel w8, w0, w8, lo +; CHECK-NEXT: csel w8, w0, w8, hi ; CHECK-NEXT: add w0, w8, #42 // =42 ; CHECK-NEXT: ret %c = icmp ult i16 %x, -43 @@ -82,11 +81,10 @@ define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w10, #65493 -; CHECK-NEXT: add w9, w0, #42 // =42 -; CHECK-NEXT: cmp w8, w10 -; CHECK-NEXT: csinv w0, w9, wzr, ls +; CHECK-NEXT: mov w9, #65493 +; CHECK-NEXT: add w8, w0, #42 // =42 +; CHECK-NEXT: cmp w9, w0, uxth +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %a = add i16 %x, 42 %c = icmp ugt i16 %x, -43 Index: llvm/trunk/test/CodeGen/AArch64/signed-truncation-check.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/signed-truncation-check.ll +++ llvm/trunk/test/CodeGen/AArch64/signed-truncation-check.ll @@ -35,8 +35,7 @@ define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 16 ; 32-16 @@ -48,8 +47,7 @@ define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 24 ; 32-8 @@ -61,8 +59,7 @@ define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i32: ; CHECK: // 
%bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 32 ; 64-32 @@ -74,8 +71,7 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 48 ; 64-16 @@ -87,8 +83,7 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 56 ; 64-8 @@ -117,8 +112,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, -32768 ; ~0U << (16-1) @@ -129,8 +123,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, -128 ; ~0U << (8-1) @@ -141,8 +134,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1) @@ -153,8 +145,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -32768 ; ~0U << (16-1) @@ -165,8 +156,7 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: 
add_ugecmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -128 ; ~0U << (8-1) @@ -208,8 +198,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, 32768 ; 1U << (16-1) @@ -220,8 +209,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, 128 ; 1U << (8-1) @@ -232,8 +220,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1) @@ -244,8 +231,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 32768 ; 1U << (16-1) @@ -256,8 +242,7 @@ define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 128 ; 1U << (8-1) Index: llvm/trunk/test/CodeGen/AArch64/swap-compare-operands.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/swap-compare-operands.ll +++ llvm/trunk/test/CodeGen/AArch64/swap-compare-operands.ll @@ -0,0 +1,632 @@ +; RUN: llc < %s -mtriple=arm64 | FileCheck %s + +define i1 
@testSwapCmpWithLSL64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSL64_1: +; CHECK: cmp x1, x0, lsl #1 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i64 %a, 1 + %cmp = icmp slt i64 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSL64_63: +; CHECK: cmp x1, x0, lsl #63 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i64 %a, 63 + %cmp = icmp slt i64 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSL32_1: +; CHECK: cmp w1, w0, lsl #1 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i32 %a, 1 + %cmp = icmp slt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSL32_31: +; CHECK: cmp w1, w0, lsl #31 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i32 %a, 31 + %cmp = icmp slt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSR64_1: +; CHECK: cmp x1, x0, lsr #1 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i64 %a, 1 + %cmp = icmp slt i64 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSR64_63: +; CHECK: cmp x1, x0, lsr #63 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i64 %a, 63 + %cmp = icmp slt i64 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSR32_1: +; CHECK: cmp w1, w0, lsr #1 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i32 %a, 1 + %cmp = icmp slt i32 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSR32_31: +; CHECK: cmp w1, w0, lsr #31 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i32 %a, 31 + %cmp = icmp slt i32 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithASR64_1: +; CHECK: cmp x1, x0, asr #1 +; CHECK-NEXT: cset w0, gt 
+entry: + %ashr = ashr i64 %a, 1 + %cmp = icmp slt i64 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithASR64_63: +; CHECK: cmp x1, x0, asr #63 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i64 %a, 63 + %cmp = icmp slt i64 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithASR32_1: +; CHECK: cmp w1, w0, asr #1 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i32 %a, 1 + %cmp = icmp slt i32 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithASR32_31: +; CHECK: cmp w1, w0, asr #31 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i32 %a, 31 + %cmp = icmp slt i32 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend32_64(i32 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64 +; CHECK: cmp x1, w0, uxtw #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i32 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64 +; CHECK: cmp x1, w0, uxth #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i16 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp x1, w0, uxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i8 %a to i64 + %shl.2 = shl i64 %a64, 4 + %cmp = icmp ugt i64 %shl.2, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp w1, w0, uxth #3 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i16 %a to i32 + %shl = shl i32 %a32, 3 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 
@testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp w1, w0, uxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %shl = shl i32 %a32, 4 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithTooLargeShiftedZeroExtend8_64 +; CHECK: and [[REG:w[0-9]+]], w0, #0xff +; CHECK: cmp w1, [[REG]], lsl #5 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %shl = shl i32 %a32, 5 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithZeroExtend8_64 +; CHECK: cmp w1, w0, uxtb +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %cmp = icmp ugt i32 %a32, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend32_64(i32 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64 +; CHECK: cmp x1, w0, sxtw #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i32 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend16_64(i16 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64 +; CHECK: cmp x1, w0, sxth #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i16 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend8_64(i8 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp x1, w0, sxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i8 %a to i64 + %shl.2 = shl i64 %a64, 4 + %cmp = icmp ugt i64 %shl.2, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend16_32(i16 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp w1, w0, sxth #3 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i16 %a to i32 + %shl = shl i32 %a32, 3 + %cmp = icmp ugt i32 
%shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp w1, w0, sxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %shl = shl i32 %a32, 4 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithTooLargeShiftedSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithTooLargeShiftedSignExtend8_64 +; CHECK: sxtb [[REG:w[0-9]+]], w0 +; CHECK-NEXT: cmp w1, [[REG]], lsl #5 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %shl = shl i32 %a32, 5 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithSignExtend8_64 +; CHECK: cmp w1, w0, sxtb +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %cmp = icmp ugt i32 %a32, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSL64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_1: +; CHECK: cmn x1, x0, lsl #1 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 1 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. +define i1 @testSwapCmnWithLSL64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_62: +; CHECK: cmn x1, x0, lsl #62 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 62 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. 
+define i1 @testSwapCmnWithLSL64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_63: +; CHECK: cmp x1, x0, lsl #63 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 63 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSL32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_1: +; CHECK: cmn w1, w0, lsl #1 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 1 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 30 has another optimization kicking in. +define i1 @testSwapCmnWithLSL32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_30: +; CHECK: cmn w1, w0, lsl #30 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 30 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithLSL32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_31: +; CHECK: cmp w1, w0, lsl #31 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 31 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_1: +; CHECK: cmn x1, x0, lsr #1 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 1 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. 
+define i1 @testSwapCmnWithLSR64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_62: +; CHECK: cmn x1, x0, lsr #62 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 62 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithLSR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_63: +; CHECK: cmp x1, x0, asr #63 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 63 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_1: +; CHECK: cmn w1, w0, lsr #1 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 1 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 31 has another optimization kicking in. +define i1 @testSwapCmnWithLSR32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_30: +; CHECK: cmn w1, w0, lsr #30 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 30 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. 
+define i1 @testSwapCmnWithLSR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_31: +; CHECK: cmp w1, w0, asr #31 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 31 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithASR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_1: +; CHECK: cmn x1, x0, asr #3 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 3 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. +define i1 @testSwapCmnWithASR64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_62: +; CHECK: cmn x1, x0, asr #62 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 62 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithASR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_63: +; CHECK: cmp x1, x0, lsr #63 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 63 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithASR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_1: +; CHECK: cmn w1, w0, asr #1 +; CHECK-NEXT: cset w0, eq +entry: + %lshr = ashr i32 %a, 1 + %na = sub i32 0, %lshr + %cmp = icmp eq i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 31 has another optimization kicking in. 
+define i1 @testSwapCmnWithASR32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_30: +; CHECK: cmn w1, w0, asr #30 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i32 %a, 30 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithASR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_31: +; CHECK: cmp w1, w0, lsr #31 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i32 %a, 31 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i64 @testSwapCmpToCmnWithZeroExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) { +; CHECK-LABEL testSwapCmpToCmnWithZeroExtend: +t0: + %conv0 = zext i32 %a32 to i64 + %shl0 = shl i64 %conv0, 1 + %na0 = sub i64 0, %shl0 + %cmp0 = icmp ne i64 %na0, %b64 +; CHECK: cmn x3, w0, uxtw #1 + br i1 %cmp0, label %t1, label %end + +t1: + %conv1 = zext i16 %a16 to i64 + %shl1 = shl i64 %conv1, 4 + %na1 = sub i64 0, %shl1 + %cmp1 = icmp ne i64 %na1, %b64 +; CHECK: cmn x3, w1, uxth #4 + br i1 %cmp1, label %t2, label %end + +t2: + %conv2 = zext i8 %a8 to i64 + %shl2 = shl i64 %conv2, 3 + %na2 = sub i64 0, %shl2 + %cmp2 = icmp ne i64 %na2, %b64 +; CHECK: cmn x3, w2, uxtb #3 + br i1 %cmp2, label %t3, label %end + +t3: + %conv3 = zext i16 %a16 to i32 + %shl3 = shl i32 %conv3, 2 + %na3 = sub i32 0, %shl3 + %cmp3 = icmp ne i32 %na3, %b32 +; CHECK: cmn w4, w1, uxth #2 + br i1 %cmp3, label %t4, label %end + +t4: + %conv4 = zext i8 %a8 to i32 + %shl4 = shl i32 %conv4, 1 + %na4 = sub i32 0, %shl4 + %cmp4 = icmp ne i32 %na4, %b32 +; CHECK: cmn w4, w2, uxtb #1 + br i1 %cmp4, label %t5, label %end + +t5: + %conv5 = zext i8 %a8 to i32 + %shl5 = shl i32 %conv5, 5 + %na5 = sub i32 0, %shl5 + %cmp5 = icmp ne i32 %na5, %b32 +; 
CHECK: and [[REG:w[0-9]+]], w2, #0xff +; CHECK: cmn w4, [[REG]], lsl #5 + br i1 %cmp5, label %t6, label %end + +t6: + %conv6 = zext i8 %a8 to i32 + %na6 = sub i32 0, %conv6 + %cmp6 = icmp ne i32 %na6, %b32 +; CHECK: cmn w4, w2, uxtb + br i1 %cmp6, label %t7, label %end + +t7: + ret i64 0 + +end: + ret i64 1 +} +define i64 @testSwapCmpToCmnWithSignExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) { +; CHECK-LABEL testSwapCmpToCmnWithSignExtend: +t0: + %conv0 = sext i32 %a32 to i64 + %shl0 = shl i64 %conv0, 1 + %na0 = sub i64 0, %shl0 + %cmp0 = icmp ne i64 %na0, %b64 +; CHECK: cmn x3, w0, sxtw #1 + br i1 %cmp0, label %t1, label %end + +t1: + %conv1 = sext i16 %a16 to i64 + %shl1 = shl i64 %conv1, 4 + %na1 = sub i64 0, %shl1 + %cmp1 = icmp ne i64 %na1, %b64 +; CHECK: cmn x3, w1, sxth #4 + br i1 %cmp1, label %t2, label %end + +t2: + %conv2 = sext i8 %a8 to i64 + %shl2 = shl i64 %conv2, 3 + %na2 = sub i64 0, %shl2 + %cmp2 = icmp ne i64 %na2, %b64 +; CHECK: cmn x3, w2, sxtb #3 + br i1 %cmp2, label %t3, label %end + +t3: + %conv3 = sext i16 %a16 to i32 + %shl3 = shl i32 %conv3, 2 + %na3 = sub i32 0, %shl3 + %cmp3 = icmp ne i32 %na3, %b32 +; CHECK: cmn w4, w1, sxth #2 + br i1 %cmp3, label %t4, label %end + +t4: + %conv4 = sext i8 %a8 to i32 + %shl4 = shl i32 %conv4, 1 + %na4 = sub i32 0, %shl4 + %cmp4 = icmp ne i32 %na4, %b32 +; CHECK: cmn w4, w2, sxtb #1 + br i1 %cmp4, label %t5, label %end + +t5: + %conv5 = sext i8 %a8 to i32 + %shl5 = shl i32 %conv5, 5 + %na5 = sub i32 0, %shl5 + %cmp5 = icmp ne i32 %na5, %b32 +; CHECK: sxtb [[REG:w[0-9]+]], w2 +; CHECK: cmn w4, [[REG]], lsl #5 + br i1 %cmp5, label %t6, label %end + +t6: + %conv6 = sext i8 %a8 to i32 + %na6 = sub i32 0, %conv6 + %cmp6 = icmp ne i32 %na6, %b32 +; CHECK: cmn w4, w2, sxtb + br i1 %cmp6, label %t7, label %end + +t7: + ret i64 0 + +end: + ret i64 1 +}