Index: lib/Target/ARM/ARMCodeGenPrepare.cpp
===================================================================
--- lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -247,41 +247,108 @@
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
 
+  // We can support a potentially overflowing instruction (I) if:
+  // - It is only used by an unsigned icmp.
+  // - The icmp uses a constant.
+  // - The overflowing value (I) is decreasing, i.e. it would underflow,
+  //   wrapping around zero to become a larger number than before.
+  // - The underflowing instruction (I) also uses a constant.
+  //
+  // We can then use the two constants to calculate whether the result would
+  // wrap with respect to itself in the original bitwidth. If it doesn't wrap
+  // and just underflows the range, the icmp gives the same result whether
+  // or not the result has been truncated. We calculate this by:
+  // - Zero extend both constants, if needed, to 32 bits.
+  // - Take the absolute value of I's constant and add it to the icmp constant.
+  // - Check that this value is not out of range for the small type. If it is,
+  //   it means that it has underflowed enough to wrap around the icmp constant.
+  //
+  // For example:
+  //
+  // %sub = sub i8 %a, 2
+  // %cmp = icmp ule i8 %sub, 254
+  //
+  // If %a = 0, %sub = -2 == FE == 254
+  // But if this is evaluated as an i32
+  // %sub = -2 == FF FF FF FE == 4294967294
+  // So the unsigned compares (i8 and i32) would not yield the same result.
+  //
+  // Whereas:
+  //
+  // %sub = sub i8 %a, 1
+  // %cmp = icmp ule i8 %sub, 254
+  //
+  // If %a = 0, %sub = -1 == FF == 255
+  // As i32:
+  // %sub = -1 == FF FF FF FF == 4294967295
+  //
+  // In this case, the unsigned compare results would be the same and this
+  // would also be true for ult, uge and ugt:
+  // - (255 < 254) == (0xFFFFFFFF < 254) == false
+  // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
+  // - (255 > 254) == (0xFFFFFFFF > 254) == true
+  // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
+  //
+  // To demonstrate why we can't handle increasing values:
+  // 
+  // %add = add i8 %a, 2
+  // %cmp = icmp ult i8 %add, 127
+  //
+  // If %a = 254, %add = 256 == (i8 0)
+  // As i32:
+  // %add = 256
+  //
+  // (0 < 127) != (256 < 127)
+
   unsigned Opc = I->getOpcode();
-  if (Opc == Instruction::Add || Opc == Instruction::Sub) {
-    // We don't care if the add or sub could wrap if the value is decreasing
-    // and is only being used by an unsigned compare.
-    if (!I->hasOneUse() ||
-        !isa<ICmpInst>(*I->user_begin()) ||
-        !isa<ConstantInt>(I->getOperand(1)))
-      return false;
+  if (Opc != Instruction::Add && Opc != Instruction::Sub)
+    return false;
 
-    auto *CI = cast<ICmpInst>(*I->user_begin());
-    
-    // Don't support an icmp that deals with sign bits, including negative
-    // immediates
-    if (CI->isSigned())
-      return false;
+  if (!I->hasOneUse() ||
+      !isa<ICmpInst>(*I->user_begin()) ||
+      !isa<ConstantInt>(I->getOperand(1)))
+    return false;
 
-    if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
-      if (Const->isNegative())
-        return false;
+  ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
+  bool NegImm = OverflowConst->isNegative();
+  bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+                       ((Opc == Instruction::Add) && NegImm);
+  if (!IsDecreasing)
+    return false;
 
-    if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
-      if (Const->isNegative())
-        return false;
+  // Don't support an icmp that deals with sign bits.
+  auto *CI = cast<ICmpInst>(*I->user_begin());
+  if (CI->isSigned() || CI->isEquality())
+    return false;
 
-    bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
-    bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
-                        ((Opc == Instruction::Add) && NegImm);
-    if (!IsDecreasing)
-      return false;
+  ConstantInt *ICmpConst = nullptr;
+  if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
+    ICmpConst = Const;
+  else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
+    ICmpConst = Const;
+  else
+    return false;
 
-    LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
-    return true;
-  }
+  // Now check that the result can't wrap on itself.
+  APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
+    ICmpConst->getValue().zext(32) : ICmpConst->getValue();
 
-  return false;
+  Total += OverflowConst->getValue().getBitWidth() < 32 ?
+    OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();
+
+  APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize);
+
+  if (Total.getBitWidth() > Max.getBitWidth()) {
+    if (Total.ugt(Max.zext(Total.getBitWidth())))
+      return false;
+  } else if (Max.getBitWidth() > Total.getBitWidth()) {
+    if (Total.zext(Max.getBitWidth()).ugt(Max))
+      return false;
+  } else if (Total.ugt(Max))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+  return true;
 }
 
 static bool shouldPromote(Value *V) {
@@ -459,6 +526,8 @@
     if (!shouldPromote(V) || isPromotedResultSafe(V))
       continue;
 
+    assert(EnableDSP && "DSP intrinisc insertion not enabled!");
+
     // Replace unsafe instructions with appropriate intrinsic calls.
     InsertDSPIntrinsic(cast<Instruction>(V));
   }
Index: test/CodeGen/ARM/arm-cgp-overflow.ll
===================================================================
--- test/CodeGen/ARM/arm-cgp-overflow.ll
+++ test/CodeGen/ARM/arm-cgp-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s
 
 ; CHECK: overflow_add
 ; CHECK: add
@@ -47,3 +47,134 @@
   %res = select i1 %cmp, i16 2, i16 5
   ret i16 %res
 }
+
+; CHECK-LABEL: overflow_add_no_consts:
+; CHECK:  add r0, r1
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], r2
+; CHECK:  movhi r0, #8
+define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) {
+  %add = add i8 %a, %b
+  %cmp = icmp ugt i8 %add, %limit
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: overflow_add_const_limit:
+; CHECK:  add r0, r1
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #128
+; CHECK:  movhi r0, #8
+define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
+  %add = add i8 %a, %b
+  %cmp = icmp ugt i8 %add, 128
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: overflow_add_positive_const_limit:
+; CHECK:  adds r0, #1
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #128
+; CHECK:  movhi r0, #8
+define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
+  %add = add i8 %a, 1
+  %cmp = icmp ugt i8 %add, 128
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: unsafe_add_underflow:
+; CHECK:  subs r0, #2
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #255
+; CHECK:  moveq r0, #8
+define i32 @unsafe_add_underflow(i8 zeroext %a) {
+  %add = add i8 %a, -2
+  %cmp = icmp ugt i8 %add, 254
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: safe_add_underflow:
+; CHECK:      subs [[MINUS_1:r[0-9]+]], r0, #1
+; CHECK-NOT:  uxtb
+; CHECK:      cmp [[MINUS_1]], #254
+; CHECK:      movhi r0, #8
+define i32 @safe_add_underflow(i8 zeroext %a) {
+  %add = add i8 %a, -1
+  %cmp = icmp ugt i8 %add, 254
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: safe_add_underflow_neg:
+; CHECK:      subs [[MINUS_1:r[0-9]+]], r0, #2
+; CHECK-NOT:  uxtb
+; CHECK:      cmp [[MINUS_1]], #251
+; CHECK:      movlo r0, #8
+define i32 @safe_add_underflow_neg(i8 zeroext %a) {
+  %add = add i8 %a, -2
+  %cmp = icmp ule i8 %add, -6
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: overflow_sub_negative_const_limit:
+; CHECK:  adds r0, #1
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #128
+; CHECK:  movhi r0, #8
+define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
+  %sub = sub i8 %a, -1
+  %cmp = icmp ugt i8 %sub, 128
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: unsafe_sub_underflow:
+; CHECK:  subs r0, #6
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #250
+; CHECK:  movhi r0, #8
+define i32 @unsafe_sub_underflow(i8 zeroext %a) {
+  %sub = sub i8 %a, 6
+  %cmp = icmp ugt i8 %sub, 250
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: safe_sub_underflow:
+; CHECK:      subs [[MINUS_1:r[0-9]+]], r0, #1
+; CHECK-NOT:  uxtb
+; CHECK:      cmp [[MINUS_1]], #255
+; CHECK:      movlo r0, #8
+define i32 @safe_sub_underflow(i8 zeroext %a) {
+  %sub = sub i8 %a, 1
+  %cmp = icmp ule i8 %sub, 254
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK-LABEL: safe_sub_underflow_neg
+; CHECK:      subs [[MINUS_1:r[0-9]+]], r0, #4
+; CHECK-NOT:  uxtb
+; CHECK:      cmp [[MINUS_1]], #250
+; CHECK:      movhi r0, #8
+define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
+  %sub = sub i8 %a, 4
+  %cmp = icmp uge i8 %sub, -5
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+; CHECK:  subs r0, #4
+; CHECK:  uxtb [[EXT:r[0-9]+]], r0
+; CHECK:  cmp [[EXT]], #253
+; CHECK:  movlo r0, #8
+define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) {
+  %sub = sub i8 %a, 4
+  %cmp = icmp ult i8 %sub, -3
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
Index: test/CodeGen/ARM/arm-cgp-signed-icmps.ll
===================================================================
--- test/CodeGen/ARM/arm-cgp-signed-icmps.ll
+++ test/CodeGen/ARM/arm-cgp-signed-icmps.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-disable-cgp=false -mattr=-use-misched %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-disable-cgp=false -mattr=-use-misched %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
 ; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
 ; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
 
@@ -47,13 +47,22 @@
 ; CHECK-NODSP: cmp
 ; CHECK-NODSP: cmp
 
-; CHECK-DSP: sxth [[ARG:r[0-9]+]], r2
-; CHECK-DSP: subs [[SUB:r[0-9]+]],
-; CHECK-DSP: uadd16 [[ADD:r[0-9]+]],
-; CHECK-DSP: sxth.w [[SEXT:r[0-9]+]], [[ADD]]
-; CHECK-DSP: cmp [[SEXT]], [[ARG]]
-; CHECK-DSP-NOT: uxt
-; CHECK-DSP: cmp [[SUB]], r2
+; CHECK-DSP: sub
+; CHECK-DSP: sxth
+; CHECK-DSP: add
+; CHECK-DSP: uxth
+; CHECK-DSP: sxth
+; CHECK-DSP: cmp
+; CHECK-DSP: cmp
+
+; CHECK-DSP-IMM: sxth [[ARG:r[0-9]+]], r2
+; CHECK-DSP-IMM: uadd16 [[ADD:r[0-9]+]],
+; CHECK-DSP-IMM: sxth.w [[SEXT:r[0-9]+]], [[ADD]]
+; CHECK-DSP-IMM: cmp [[SEXT]], [[ARG]]
+; CHECK-DSP-IMM-NOT: uxt
+; CHECK-DSP-IMM: movs [[ONE:r[0-9]+]], #1
+; CHECK-DSP-IMM: usub16 [[SUB:r[0-9]+]], r1, [[ONE]]
+; CHECK-DSP-IMM: cmp [[SUB]], r2
 define i16 @ugt_slt(i16 *%x, i16 zeroext %y, i16 zeroext %z) {
 entry:
   %load0 = load i16, i16* %x, align 1
Index: test/CodeGen/ARM/pr39060.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/pr39060.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=armv7a-linux-androideabi %s -o - | FileCheck %s
+
+@a = local_unnamed_addr global i16 -1, align 2
+@b = local_unnamed_addr global i16 0, align 2
+
+; CHECK-LABEL: pr39060:
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK: sub
+; CHECK: uxth
+define void @pr39060() local_unnamed_addr #0 {
+entry:
+  %0 = load i16, i16* @a, align 2
+  %1 = load i16, i16* @b, align 2
+  %sub = add i16 %1, -1
+  %cmp = icmp eq i16 %0, %sub
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:
+  tail call void bitcast (void (...)* @f to void ()*)() #2
+  br label %if.end
+
+if.else:
+  tail call void bitcast (void (...)* @g to void ()*)() #2
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @f(...) local_unnamed_addr #1
+
+declare void @g(...) local_unnamed_addr #1