diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11279,16 +11279,53 @@
       }
   }
 
+  // Finds the pattern implementing the zero extension inreg for
+  // illegal values, which is rendered with an and instruction with a
+  // bit mask. For example, the node for zero extending the load of an
+  // i8 value into an i32 value is rendered as:
+  //
+  // i32 = (and (load i8) 0xff)
+  auto IsZeroExtInReg = [this](SDNode *User) -> bool {
+    // Only an AND whose operands are (load, constant) can match.
+    if (User->getOpcode() != ISD::AND)
+      return false;
+    auto *MaskC = dyn_cast<ConstantSDNode>(User->getOperand(1));
+    auto *Ld = dyn_cast<LoadSDNode>(User->getOperand(0));
+    if (!(MaskC && Ld))
+      return false;
+
+    // ExtVT exists only to satisfy isAndLoadExtLoad's signature; it is not
+    // read afterwards. The load's memory VT is passed as the type argument.
+    EVT ExtVT;
+    return isAndLoadExtLoad(MaskC, Ld, Ld->getMemoryVT(), ExtVT);
+  };
+
+  // Checks if the two uses of a load are extensions that differ in
+  // signedness: one sext_inreg and one zero-extending AND, in either order.
+  auto UsesDifferInSignExtension = [&IsZeroExtInReg](LoadSDNode *Load) -> bool {
+    // NOTE(review): confirm whether use_size() also counts chain users.
+    if (Load->use_size() != 2)
+      return false;
+    SDNode::use_iterator UseIt = Load->use_begin();
+    SDNode *UseOne = *UseIt;
+    SDNode *UseTwo = *++UseIt;
+    auto IsSExtInReg = [](const SDNode *U) {
+      return U->getOpcode() == ISD::SIGN_EXTEND_INREG;
+    };
+    return (IsSExtInReg(UseOne) && IsZeroExtInReg(UseTwo)) ||
+           (IsSExtInReg(UseTwo) && IsZeroExtInReg(UseOne));
+  };
+
   // fold (sext_inreg (extload x)) -> (sextload x)
   // If sextload is not supported by target, we can only do the combine when
   // load has one use. Doing otherwise can block folding the extload with other
   // extends that the target does support.
-  if (ISD::isEXTLoad(N0.getNode()) &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
         N0.hasOneUse()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) &&
+      !UsesDifferInSignExtension(cast<LoadSDNode>(N0))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5874,9 +5874,9 @@
             isa<ConstantSDNode>(LHS.getOperand(1)) &&
             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
           SDValue Test = LHS.getOperand(0);
-          uint64_t Mask = LHS.getConstantOperandVal(1);
+          uint64_t SignBitPos = LHS.getConstantOperandVal(1);
           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
-                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
+                             DAG.getConstant(Log2_64(SignBitPos), dl, MVT::i64),
                              Dest);
         }
 
@@ -5885,9 +5885,17 @@
         // Don't combine AND since emitComparison converts the AND to an ANDS
         // (a.k.a. TST) and the test in the test bit and branch instruction
         // becomes redundant.  This would also increase register pressure.
-        uint64_t Mask = LHS.getValueSizeInBits() - 1;
+        uint64_t SignBitPos = LHS.getValueSizeInBits() - 1;
+        // If LHS is a sext_inreg, we can check the sign bit of the
+        // original unextended data.
+        if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+          SignBitPos =
+              cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() -
+              1;
+          LHS = LHS.getOperand(0);
+        }
         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
-                           DAG.getConstant(Mask, dl, MVT::i64), Dest);
+                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
       }
     }
     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
@@ -5896,6 +5904,13 @@
       // (a.k.a. TST) and the test in the test bit and branch instruction
       // becomes redundant.  This would also increase register pressure.
       uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      // If LHS is a sext_inreg, we can check the sign bit of the
+      // original unextended data.
+      if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+        Mask =
+            cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() - 1;
+        LHS = LHS.getOperand(0);
+      }
       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                          DAG.getConstant(Mask, dl, MVT::i64), Dest);
     }
diff --git a/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple aarch64-linux-gnu -o -  -asm-verbose=0 < %s | FileCheck %s
+; i8 load that is zero-extended and also sign-tested: expect ldrb + tbnz #7.
+define i32 @f_i32_i8(i8* %p) nounwind {
+; CHECK-LABEL: f_i32_i8:
+; CHECK-NEXT:          ldrb   w[[N:[0-9]+]], [x0]
+; CHECK-NEXT:          tbnz    w[[N]], #7, .LBB[[BB:.*]]
+; CHECK-NEXT:          add    w0, w[[N]], w[[N]]
+; CHECK-NEXT:          ret
+; CHECK-NEXT: .LBB[[BB]]
+; CHECK-NEXT:          mul    w0, w[[N]], w[[N]]
+; CHECK-NEXT:          ret
+entry:
+  %0 = load i8, i8* %p
+  %conv = zext i8 %0 to i32
+  %cmp = icmp sgt i8 %0, -1
+  br i1 %cmp, label %A, label %B
+
+A:
+  %retval2 = add i32 %conv, %conv
+  ret i32 %retval2
+
+B:
+  %retval1 = mul i32 %conv, %conv
+  ret i32 %retval1
+}
+; i16 load that is zero-extended and also sign-tested: expect ldrh + tbnz #15.
+define i32 @f_i32_i16(i16* %p) nounwind {
+; CHECK-LABEL: f_i32_i16:
+; CHECK-NEXT:          ldrh   w[[N:[0-9]+]], [x0]
+; CHECK-NEXT:          tbnz    w[[N]], #15, .LBB[[BB:.*]]
+; CHECK-NEXT:          add    w0, w[[N]], w[[N]]
+; CHECK-NEXT:          ret
+; CHECK-NEXT: .LBB[[BB]]
+; CHECK-NEXT:          mul    w0, w[[N]], w[[N]]
+; CHECK-NEXT:          ret
+entry:
+  %0 = load i16, i16* %p
+  %conv = zext i16 %0 to i32
+  %cmp = icmp sgt i16 %0, -1
+  br i1 %cmp, label %A, label %B
+
+A:
+  %retval2 = add i32 %conv, %conv
+  ret i32 %retval2
+
+B:
+  %retval1 = mul i32 %conv, %conv
+  ret i32 %retval1
+}
+; As f_i32_i8, but the zero-extended value is returned directly on one path.
+define i32 @g_i32_i8(i8* %p) nounwind {
+; CHECK-LABEL: g_i32_i8:
+; CHECK-NEXT:          ldrb    w0, [x0]
+; CHECK-NEXT:          tbnz    w0, #7, .LBB[[BB:.*]]
+; CHECK-NEXT:          ret
+; CHECK-NEXT: .LBB[[BB]]
+; CHECK-NEXT:          lsl   w0, w0, #1
+; CHECK-NEXT:          ret
+entry:
+  %0 = load i8, i8* %p, align 1
+  %conv = zext i8 %0 to i32
+  %cmp1 = icmp sgt i8 %0, -1
+  br i1 %cmp1, label %return, label %B
+
+B:                                                ; preds = %entry
+  %add = shl nuw nsw i32 %conv, 1
+  ret i32 %add
+
+return:                                           ; preds = %entry
+  ret i32 %conv
+}
+; As f_i32_i16, but the zero-extended value is returned directly on one path.
+define i32 @g_i32_i16(i16* %p) nounwind {
+; CHECK-LABEL: g_i32_i16:
+; CHECK-NEXT:          ldrh    w0, [x0]
+; CHECK-NEXT:          tbnz    w0, #15, .LBB[[BB:.*]]
+; CHECK-NEXT:          ret
+; CHECK-NEXT: .LBB[[BB]]
+; CHECK-NEXT:          lsl   w0, w0, #1
+; CHECK-NEXT:          ret
+entry:
+  %0 = load i16, i16* %p, align 1
+  %conv = zext i16 %0 to i32
+  %cmp1 = icmp sgt i16 %0, -1
+  br i1 %cmp1, label %return, label %B
+
+B:                                                ; preds = %entry
+  %add = shl nuw nsw i32 %conv, 1
+  ret i32 %add
+
+return:                                           ; preds = %entry
+  ret i32 %conv
+}
+
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
@@ -82,19 +82,19 @@
 ; ENABLE-NEXT:    bhi .LBB0_7
 ; ENABLE-NEXT:  @ %bb.14: @ %while.body24.preheader
 ; ENABLE-NEXT:    @ in Loop: Header=BB0_7 Depth=1
-; ENABLE-NEXT:    sub r3, r3, #2
+; ENABLE-NEXT:    sub lr, r3, #2
 ; ENABLE-NEXT:  .LBB0_15: @ %while.body24
 ; ENABLE-NEXT:    @ Parent Loop BB0_7 Depth=1
 ; ENABLE-NEXT:    @ => This Inner Loop Header: Depth=2
-; ENABLE-NEXT:    mov r0, r3
-; ENABLE-NEXT:    cmp r3, r2
+; ENABLE-NEXT:    mov r0, lr
+; ENABLE-NEXT:    cmp lr, r2
 ; ENABLE-NEXT:    bls .LBB0_7
 ; ENABLE-NEXT:  @ %bb.16: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT:    @ in Loop: Header=BB0_15 Depth=2
-; ENABLE-NEXT:    mov r3, r0
-; ENABLE-NEXT:    ldrsb lr, [r3], #-1
-; ENABLE-NEXT:    cmn lr, #1
-; ENABLE-NEXT:    uxtb r12, lr
+; ENABLE-NEXT:    mov lr, r0
+; ENABLE-NEXT:    ldrb r12, [lr], #-1
+; ENABLE-NEXT:    sxtb r3, r12
+; ENABLE-NEXT:    cmn r3, #1
 ; ENABLE-NEXT:    bgt .LBB0_7
 ; ENABLE-NEXT:  @ %bb.17: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT:    @ in Loop: Header=BB0_15 Depth=2
@@ -172,19 +172,19 @@
 ; DISABLE-NEXT:    bhi .LBB0_7
 ; DISABLE-NEXT:  @ %bb.14: @ %while.body24.preheader
 ; DISABLE-NEXT:    @ in Loop: Header=BB0_7 Depth=1
-; DISABLE-NEXT:    sub r3, r3, #2
+; DISABLE-NEXT:    sub lr, r3, #2
 ; DISABLE-NEXT:  .LBB0_15: @ %while.body24
 ; DISABLE-NEXT:    @ Parent Loop BB0_7 Depth=1
 ; DISABLE-NEXT:    @ => This Inner Loop Header: Depth=2
-; DISABLE-NEXT:    mov r0, r3
-; DISABLE-NEXT:    cmp r3, r2
+; DISABLE-NEXT:    mov r0, lr
+; DISABLE-NEXT:    cmp lr, r2
 ; DISABLE-NEXT:    bls .LBB0_7
 ; DISABLE-NEXT:  @ %bb.16: @ %while.body24.land.rhs14_crit_edge
 ; DISABLE-NEXT:    @ in Loop: Header=BB0_15 Depth=2
-; DISABLE-NEXT:    mov r3, r0
-; DISABLE-NEXT:    ldrsb lr, [r3], #-1
-; DISABLE-NEXT:    cmn lr, #1
-; DISABLE-NEXT:    uxtb r12, lr
+; DISABLE-NEXT:    mov lr, r0
+; DISABLE-NEXT:    ldrb r12, [lr], #-1
+; DISABLE-NEXT:    sxtb r3, r12
+; DISABLE-NEXT:    cmn r3, #1
 ; DISABLE-NEXT:    bgt .LBB0_7
 ; DISABLE-NEXT:  @ %bb.17: @ %while.body24.land.rhs14_crit_edge
 ; DISABLE-NEXT:    @ in Loop: Header=BB0_15 Depth=2