diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5728,6 +5728,32 @@
   if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
     return V;
 
+  // Recognize the following pattern:
+  //
+  //   AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask)
+  //
+  // where #bitmask is a mask that clears all the upper bits of AndVT
+  // outside the bits of NarrowVT.
+  auto IsAndZeroExtMask = [this](SDValue LHS, SDValue RHS) {
+    if (LHS->getOpcode() != ISD::SIGN_EXTEND)
+      return false;
+
+    auto *C = dyn_cast<ConstantSDNode>(RHS);
+    if (!C || !C->getAPIntValue().isMask())
+      return false;
+
+    EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(),
+                                     C->getAPIntValue().countTrailingOnes());
+    if (NarrowVT != LHS.getOperand(0).getValueType())
+      return false;
+
+    return true;
+  };
+
+  // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
+  if (IsAndZeroExtMask(N0, N1))
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+
   return SDValue();
 }
 
@@ -11335,16 +11361,38 @@
     }
   }
 
+  // Finds the pattern implementing the in-register zero extension of
+  // illegal values, which is rendered as an and instruction with a
+  // bit mask. For example, the node zero extending the load of an
+  // i8 value into an i32 value is rendered as:
+  //
+  //   i32 = (and (load i8) 0xff)
+  auto IsZeroExtInReg = [this](SDNode *N) -> bool {
+    if (N->getOpcode() != ISD::AND)
+      return false;
+
+    auto *AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    auto *LoadN = dyn_cast<LoadSDNode>(N->getOperand(0));
+    if (!AndC || !LoadN)
+      return false;
+
+    EVT LoadResultTy = LoadN->getMemoryVT();
+    EVT ExtVT;
+
+    return isAndLoadExtLoad(AndC, LoadN, LoadResultTy, ExtVT);
+  };
+
   // fold (sext_inreg (extload x)) -> (sextload x)
-  // If sextload is not supported by target, we can only do the combine when
-  // load has one use. Doing otherwise can block folding the extload with other
-  // extends that the target does support.
-  if (ISD::isEXTLoad(N0.getNode()) &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+  // If sextload is not supported by target, we can only do the
+  // combine when load has one use. Doing otherwise can block folding
+  // the extload with other extends that the target does support. The
+  // folding does not happen if the load is used in a zero extension.
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() && N0.hasOneUse()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) &&
+      !llvm::any_of(N0->uses(), IsZeroExtInReg)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5874,9 +5874,9 @@
         isa<ConstantSDNode>(LHS.getOperand(1)) &&
         isPowerOf2_64(LHS.getConstantOperandVal(1))) {
       SDValue Test = LHS.getOperand(0);
-      uint64_t Mask = LHS.getConstantOperandVal(1);
+      uint64_t SignBitPos = LHS.getConstantOperandVal(1);
       return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
-                         DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
+                         DAG.getConstant(Log2_64(SignBitPos), dl, MVT::i64),
                          Dest);
     }
 
@@ -5885,9 +5885,17 @@
       // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
-      uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      uint64_t SignBitPos = LHS.getValueSizeInBits() - 1;
+      // If LHS is a sext_inreg, we can check the sign bit of the
+      // original unextended data.
+      if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+        SignBitPos =
+            cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() -
+            1;
+        LHS = LHS.getOperand(0);
+      }
       return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
-                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
+                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
     }
   }
   if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
@@ -5896,6 +5904,13 @@
       // (a.k.a. TST) and the test in the test bit and branch instruction
       // becomes redundant. This would also increase register pressure.
       uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      // If LHS is a sext_inreg, we can check the sign bit of the
+      // original unextended data.
+      if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+        Mask =
+            cast<VTSDNode>(LHS.getOperand(1))->getVT().getFixedSizeInBits() - 1;
+        LHS = LHS.getOperand(0);
+      }
       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                          DAG.getConstant(Mask, dl, MVT::i64), Dest);
     }
diff --git a/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/zext-and-signed-compare.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple aarch64-linux-gnu -o - -asm-verbose=0 < %s | FileCheck %s
+
+; The purpose of the `f_*` and `g_*` tests is to make sure that the
+; zero extension of the load caused by the `zext` instruction is
+; preferred over the sign extension caused by the signed comparison
+; "greater than -1". The effect of prioritizing the zero extension is
+; to avoid generating a sign extension of the loaded data. This is
+; done by making sure that the sign bit of the original unextended
+; data is checked instead of the sign bit of the sign extended value.
+;
+; The `f_*` and `g_*` tests differ slightly in their structure to make
+; sure that all the cases that compute the position of the sign bit in
+; AArch64ISelLowering.cpp (LowerBR_CC) are covered.
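+;
+; A rough sketch of the intended effect (illustration only, not something
+; the CHECK lines below verify verbatim; the register name and label are
+; placeholders): without the combine, the branch on `icmp sgt i8 %0, -1`
+; is performed on a sign extended value, so the bit tested is the sign
+; bit of the wide type (bit 31 for i32). With the sext_inreg handling in
+; LowerBR_CC, the sign bit of the original i8 is tested directly on the
+; result of the zero extending load, roughly:
+;
+;   ldrb w8, [x0]
+;   tbnz w8, #7, .LBB0_2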
+ +define i32 @f_i32_i8(i8* %p) nounwind { +; CHECK-LABEL: f_i32_i8: +; CHECK-NEXT: ldrb w[[N:[0-9]+]], [x0] +; CHECK-NEXT: tbnz w[[N]], #7, .LBB[[BB:.*]] +; CHECK-NEXT: add w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: mul w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +entry: + %0 = load i8, i8* %p + %conv = zext i8 %0 to i32 + %cmp = icmp sgt i8 %0, -1 + br i1 %cmp, label %A, label %B + +A: + %retval2 = add i32 %conv, %conv + ret i32 %retval2 + +B: + %retval1 = mul i32 %conv, %conv + ret i32 %retval1 +} + +define i32 @f_i32_i16(i16* %p) nounwind { +; CHECK-LABEL: f_i32_i16: +; CHECK-NEXT: ldrh w[[N:[0-9]+]], [x0] +; CHECK-NEXT: tbnz w[[N]], #15, .LBB[[BB:.*]] +; CHECK-NEXT: add w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: mul w0, w[[N]], w[[N]] +; CHECK-NEXT: ret +entry: + %0 = load i16, i16* %p + %conv = zext i16 %0 to i32 + %cmp = icmp sgt i16 %0, -1 + br i1 %cmp, label %A, label %B + +A: + %retval2 = add i32 %conv, %conv + ret i32 %retval2 + +B: + %retval1 = mul i32 %conv, %conv + ret i32 %retval1 +} + +define i32 @g_i32_i8(i8* %p) nounwind { +; CHECK-LABEL: g_i32_i8: +; CHECK-NEXT: ldrb w0, [x0] +; CHECK-NEXT: tbnz w0, #7, .LBB[[BB:.*]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: lsl w0, w0, #1 +; CHECK-NEXT: ret +entry: + %0 = load i8, i8* %p, align 1 + %conv = zext i8 %0 to i32 + %cmp1 = icmp sgt i8 %0, -1 + br i1 %cmp1, label %return, label %B + +B: ; preds = %entry + %add = shl nuw nsw i32 %conv, 1 + ret i32 %add + +return: ; preds = %entry + ret i32 %conv +} + +define i32 @g_i32_i16(i16* %p) nounwind { +; CHECK-LABEL: g_i32_i16: +; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: tbnz w0, #15, .LBB[[BB:.*]] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB[[BB]] +; CHECK-NEXT: lsl w0, w0, #1 +; CHECK-NEXT: ret +entry: + %0 = load i16, i16* %p, align 1 + %conv = zext i16 %0 to i32 + %cmp1 = icmp sgt i16 %0, -1 + br i1 %cmp1, label %return, label %B + +B: ; preds = %entry + %add = shl nuw nsw i32 %conv, 1 + ret i32 %add + +return: ; preds = %entry + ret i32 %conv +} + diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -343,8 +343,8 @@ ; ; CHECK-BE-LABEL: and_user: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-BE-NEXT: .save {r4, r5, r6, lr} +; CHECK-BE-NEXT: push {r4, r5, r6, lr} ; CHECK-BE-NEXT: cmp r0, #1 ; CHECK-BE-NEXT: blt .LBB3_4 ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader @@ -355,24 +355,23 @@ ; CHECK-BE-NEXT: .p2align 2 ; CHECK-BE-NEXT: .LBB3_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! -; CHECK-BE-NEXT: ldrsh r5, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] -; CHECK-BE-NEXT: ldrsh.w r7, [r2, #2] -; CHECK-BE-NEXT: uxth.w r6, lr -; CHECK-BE-NEXT: smlabb r5, r5, lr, r12 -; CHECK-BE-NEXT: smlabb r12, r7, r4, r5 +; CHECK-BE-NEXT: ldrh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! 
+; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2]
+; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2]
+; CHECK-BE-NEXT: smlabb r4, r4, lr, r12
+; CHECK-BE-NEXT: smlabb r12, r6, r5, r4
 ; CHECK-BE-NEXT: subs r0, #1
-; CHECK-BE-NEXT: mul r1, r6, r1
+; CHECK-BE-NEXT: mul r1, lr, r1
 ; CHECK-BE-NEXT: bne .LBB3_2
 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup
 ; CHECK-BE-NEXT: add.w r0, r12, r1
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, r6, pc}
 ; CHECK-BE-NEXT: .LBB3_4:
 ; CHECK-BE-NEXT: mov.w r12, #0
 ; CHECK-BE-NEXT: movs r1, #0
 ; CHECK-BE-NEXT: add.w r0, r12, r1
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, r6, pc}
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/ARM/and-sext-combine.ll b/llvm/test/CodeGen/ARM/and-sext-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/and-sext-combine.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - -O3 \
+; RUN:   -asm-verbose=0 | FileCheck %s
+
+; This test exercises the folding of `VT = (and (sign_extend NarrowVT to
+; VT) #bitmask)` into `VT = (zero_extend NarrowVT to VT)` when the
+; #bitmask value is the all-ones mask that selects the value of type
+; NarrowVT inside the value of type VT. The folding is implemented in
+; `DAGCombiner::visitAND`.
+
+; With this folding, the `and` of the sign extended load of `%b` in
+; `f_i16_i32` is rendered as a zero extended load.
+
+; CHECK-LABEL: f_i16_i32:
+; CHECK-NEXT: .fnstart
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrsh r0, [r0]
+; CHECK-NEXT: smulbb r0, r0, r1
+; CHECK-NEXT: mul r0, r0, r1
+; CHECK-NEXT: bx lr
+define i32 @f_i16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+  %1 = load i16, i16* %a, align 2
+  %sext.1 = sext i16 %1 to i32
+  %2 = load i16, i16* %b, align 2
+  %sext.2 = sext i16 %2 to i32
+  %masked = and i32 %sext.2, 65535
+  %mul = mul nsw i32 %sext.2, %sext.1
+  %count.next = mul i32 %mul, %masked
+  ret i32 %count.next
+}
+
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
@@ -82,19 +82,19 @@
 ; ENABLE-NEXT: bhi .LBB0_7
 ; ENABLE-NEXT: @ %bb.14: @ %while.body24.preheader
 ; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1
-; ENABLE-NEXT: sub r3, r3, #2
+; ENABLE-NEXT: sub lr, r3, #2
 ; ENABLE-NEXT: .LBB0_15: @ %while.body24
 ; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1
 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2
-; ENABLE-NEXT: mov r0, r3
-; ENABLE-NEXT: cmp r3, r2
+; ENABLE-NEXT: mov r0, lr
+; ENABLE-NEXT: cmp lr, r2
 ; ENABLE-NEXT: bls .LBB0_7
 ; ENABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2
-; ENABLE-NEXT: mov r3, r0
-; ENABLE-NEXT: ldrsb lr, [r3], #-1
-; ENABLE-NEXT: cmn lr, #1
-; ENABLE-NEXT: uxtb r12, lr
+; ENABLE-NEXT: mov lr, r0
+; ENABLE-NEXT: ldrb r12, [lr], #-1
+; ENABLE-NEXT: sxtb r3, r12
+; ENABLE-NEXT: cmn r3, #1
 ; ENABLE-NEXT: bgt .LBB0_7
 ; ENABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge
 ; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2
@@ -172,19 +172,19 @@
 ; DISABLE-NEXT: bhi .LBB0_7
 ; DISABLE-NEXT: @ %bb.14: @ %while.body24.preheader
 ; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1
-; DISABLE-NEXT: sub r3, r3, #2
+; DISABLE-NEXT: sub lr, r3, #2
 ; DISABLE-NEXT: .LBB0_15: @ %while.body24
 ; DISABLE-NEXT: @ Parent
Loop BB0_7 Depth=1 ; DISABLE-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLE-NEXT: mov r0, r3 -; DISABLE-NEXT: cmp r3, r2 +; DISABLE-NEXT: mov r0, lr +; DISABLE-NEXT: cmp lr, r2 ; DISABLE-NEXT: bls .LBB0_7 ; DISABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge ; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 -; DISABLE-NEXT: mov r3, r0 -; DISABLE-NEXT: ldrsb lr, [r3], #-1 -; DISABLE-NEXT: cmn lr, #1 -; DISABLE-NEXT: uxtb r12, lr +; DISABLE-NEXT: mov lr, r0 +; DISABLE-NEXT: ldrb r12, [lr], #-1 +; DISABLE-NEXT: sxtb r3, r12 +; DISABLE-NEXT: cmn r3, #1 ; DISABLE-NEXT: bgt .LBB0_7 ; DISABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge ; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -218,38 +218,65 @@ ; ARM scheduler emits icmp/zext before both calls, so isn't relevant ; ARMT2-LABEL: t9: -; ARMT2: bl f -; ARMT2: uxtb r0, r4 -; ARMT2: cmp r0, r0 -; ARMT2: add r1, r4, #1 -; ARMT2: mov r2, r0 -; ARMT2: add r2, r2, #1 -; ARMT2: add r1, r1, #1 -; ARMT2: uxtb r3, r2 -; ARMT2: cmp r3, r0 +; ARMT2: .save {r4, lr} +; ARMT2: push {r4, lr} +; ARMT2: ldrb r4, [r0] +; ARMT2: mov r0, #1 +; ARMT2: bl f +; ARMT2: cmp r4, r4 +; ARMT2: popne {r4, pc} +; ARMT2: .LBB8_1: +; ARMT2: sxtb r0, r4 +; ARMT2: add r0, r0, #1 +; ARMT2: mov r1, r4 +; ARMT2: .LBB8_2: +; ARMT2: add r1, r1, #1 +; ARMT2: add r0, r0, #1 +; ARMT2: uxtb r2, r1 +; ARMT2: cmp r2, r4 +; ARMT2: blt .LBB8_2 +; ARMT2: pop {r4, pc} ; THUMB1-LABEL: t9: -; THUMB1: bl f -; THUMB1: sxtb r1, r4 -; THUMB1: uxtb r0, r1 -; THUMB1: cmp r0, r0 -; THUMB1: adds r1, r1, #1 -; THUMB1: mov r2, r0 -; THUMB1: adds r1, r1, #1 -; THUMB1: adds r2, r2, #1 -; THUMB1: uxtb r3, r2 -; THUMB1: cmp r3, r0 +; THUMB1: .save {r4, lr} +; THUMB1: push {r4, lr} +; THUMB1: ldrb r4, [r0] +; THUMB1: movs r0, #1 +; THUMB1: bl f +; THUMB1: cmp r4, r4 +; THUMB1: bne .LBB8_3 +; THUMB1: sxtb r0, r4 +; THUMB1: adds r0, r0, #1 +; THUMB1: mov r1, r4 +; THUMB1: .LBB8_2: +; THUMB1: adds r0, r0, #1 +; THUMB1: adds r1, r1, #1 +; THUMB1: uxtb r2, r1 +; THUMB1: cmp r2, r4 +; THUMB1: blt .LBB8_2 +; THUMB1: .LBB8_3: +; THUMB1: pop {r4, pc} ; THUMB2-LABEL: t9: -; THUMB2: bl f -; THUMB2: uxtb r0, r4 -; THUMB2: cmp r0, r0 -; THUMB2: adds r1, r4, #1 -; THUMB2: mov r2, r0 -; THUMB2: adds r2, #1 -; THUMB2: adds r1, #1 -; THUMB2: uxtb r3, r2 -; THUMB2: cmp r3, r0 +; THUMB2: .save {r4, lr} +; THUMB2: push {r4, lr} +; THUMB2: ldrb r4, [r0] +; THUMB2: movs r0, #1 +; THUMB2: bl f +; THUMB2: cmp r4, r4 +; THUMB2: it ne +; THUMB2: popne {r4, pc} +; THUMB2: .LBB8_1: +; THUMB2: sxtb r0, r4 +; THUMB2: adds r0, #1 +; THUMB2: mov r1, r4 +; THUMB2: .LBB8_2: +; THUMB2: adds r1, #1 +; THUMB2: adds r0, #1 +; THUMB2: uxtb r2, r1 +; THUMB2: cmp r2, r4 +; THUMB2: blt .LBB8_2 +; THUMB2: pop {r4, pc} %0 = load i8, i8* %a %conv = sext i8 %0 to i32