Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2570,7 +2570,8 @@
     // bits that are implicitly ANDed off by the above opcodes and if so, skip
     // the AND.
     uint64_t MaskImm;
-    if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+    if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+        !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
       return false;
 
     if (countTrailingOnes(MaskImm) < Bits)
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1673,14 +1673,22 @@
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     LHS = LHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
-             !isUnsignedIntSetCC(CC)) {
-    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
-    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
-    // of the signed comparisons.
-    Opcode = AArch64ISD::ANDS;
-    RHS = LHS.getOperand(1);
-    LHS = LHS.getOperand(0);
+  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+    if (LHS.getOpcode() == ISD::AND) {
+      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+      // of the signed comparisons.
+      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+                                           DAG.getVTList(VT, MVT_CC),
+                                           LHS.getOperand(0),
+                                           LHS.getOperand(1));
+      // Replace all users of (and X, Y) with the newly generated (ands X, Y).
+      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+      return ANDSNode.getValue(1);
+    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+      // LHS is already an ANDS node: reuse its flags result (value #1).
+      return LHS.getValue(1);
+    }
   }
 
   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
Index: llvm/test/CodeGen/AArch64/funnel-shift.ll
===================================================================
--- llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -18,12 +18,11 @@
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsl w8, w0, w2
 ; CHECK-NEXT:    lsr w9, w1, w9
 ; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w0, w8, eq
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -146,12 +145,11 @@
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsr w8, w1, w2
 ; CHECK-NEXT:    lsl w9, w0, w9
 ; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w1, w8, eq
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
Index: llvm/test/CodeGen/AArch64/peephole-and-tst.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+%struct.anon = type { i32*, i32* }
+
+@ptr_wrapper = common dso_local local_unnamed_addr global %struct.anon* null, align 8
+
+define dso_local i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
+entry:                                            ; i32 case: %and (in do.body) is read by two icmp instructions
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i32 %mask, 0
+  br label %do.body
+
+do.body:                                          ; preds = %4, %entry
+; CHECK-LABEL: test_func_i32_two_uses:
+; CHECK: ands [[DSTREG:w[0-9]+]]
+; Usage #1: the ands destination register must show up as the first cmp operand.
+; CHECK: cmp [[DSTREG]]
+; Usage #2: the same destination register must be the register tested by cbz.
+; CHECK: cbz [[DSTREG]]
+  %bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i32 %bit.addr.0, %in                 ; value under test
+  %tobool = icmp eq i32 %and, 0                   ; first use of %and: comparison against zero
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  %dummy = and i32 %mask, %in
+  %use_and = icmp eq i32 %and, %dummy             ; second use of %and: comparison against %dummy
+  %dummy_or = or i1 %use_and, %2
+  br i1 %dummy_or, label %3, label %4
+
+3:                                                ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4:                                                ; preds = %do.body, %3
+  %shl = shl i32 %bit.addr.0, 1
+  %tobool6 = icmp eq i32 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end:                                           ; preds = %4
+  ret i32 %retval1.1
+}
+
+define dso_local i32 @test_func_i64_one_use(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr {
+entry:                                            ; i64 case: %and (in do.body) is read only by the icmp against zero
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i64 %mask, 0
+  br label %do.body
+
+do.body:                                          ; preds = %4, %entry
+; CHECK-LABEL: test_func_i64_one_use:
+; CHECK: ands [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
+; CHECK-NEXT: orr [[DSTREG]], [[SRCREG_ORR:x[0-9]+]], [[DSTREG]]
+  %bit.addr.0 = phi i64 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i64 %bit.addr.0, %in                 ; value under test
+  %tobool = icmp eq i64 %and, 0                   ; the only use of %and
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  br i1 %2, label %3, label %4
+
+3:                                                ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4:                                                ; preds = %do.body, %3
+  %shl = shl i64 %bit.addr.0, 1
+  %tobool6 = icmp eq i64 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end:                                           ; preds = %4
+  ret i32 %retval1.1
+}
Index: llvm/test/CodeGen/AArch64/shift-by-signext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/shift-by-signext.ll
+++ llvm/test/CodeGen/AArch64/shift-by-signext.ll
@@ -80,12 +80,11 @@
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n6_fshl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsl w8, w0, w2
 ; CHECK-NEXT:    lsr w9, w1, w9
 ; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w0, w8, eq
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32
@@ -95,12 +94,11 @@
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n7_fshr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsr w8, w1, w2
 ; CHECK-NEXT:    lsl w9, w0, w9
 ; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w1, w8, eq
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32