Index: llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -126,7 +126,7 @@
                    SmallPtrSetImpl<Instruction*> &SafeToPromote);
   void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
                      SmallPtrSetImpl<Instruction*> &Sinks);
-  void Cleanup(SmallPtrSetImpl<Instruction*> &Sinks);
+  void Cleanup(SmallPtrSetImpl<Value*> &Visited);
 
 public:
   IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
@@ -180,6 +180,18 @@
          Opc == Instruction::SRem;
 }
 
+static bool EqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessOrEqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
+}
+
+static bool GreaterThanTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
+}
+
 /// Some instructions can use 8- and 16-bit operands, and we don't need to
 /// promote anything larger. We disallow booleans to make life easier when
 /// dealing with icmps but allow any other integer that is <= 16 bits. Void
@@ -194,11 +206,10 @@
   if (auto *Ld = dyn_cast<LoadInst>(V))
     Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
 
-  const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
-  if (!IntTy)
+  if (!isa<IntegerType>(Ty))
     return false;
 
-  return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
+  return LessOrEqualTypeSize(V);
 }
 
 /// Return true if the given value is a source in the use-def chain, producing
@@ -221,7 +232,7 @@
   else if (auto *Call = dyn_cast<CallInst>(V))
     return Call->hasRetAttr(Attribute::AttrKind::ZExt);
   else if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc);
+    return EqualTypeSize(Trunc);
   return false;
 }
 
@@ -232,18 +243,15 @@
   // TODO The truncate also isn't actually necessary because we would already
   // proved that the data value is kept within the range of the original data
   // type.
-  auto UsesNarrowValue = [](Value *V) {
-    return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
-  };
 
   if (auto *Store = dyn_cast<StoreInst>(V))
-    return UsesNarrowValue(Store->getValueOperand());
+    return LessOrEqualTypeSize(Store->getValueOperand());
   if (auto *Return = dyn_cast<ReturnInst>(V))
-    return UsesNarrowValue(Return->getReturnValue());
+    return LessOrEqualTypeSize(Return->getReturnValue());
   if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return UsesNarrowValue(Trunc->getOperand(0));
+    return EqualTypeSize(Trunc->getOperand(0));
   if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    return UsesNarrowValue(ZExt->getOperand(0));
+    return GreaterThanTypeSize(ZExt);
   if (auto *ICmp = dyn_cast<ICmpInst>(V))
     return ICmp->isSigned();
 
@@ -649,36 +657,37 @@
       }
     }
   }
-
 }
 
-void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
-  // Some zext sinks will now have become redundant, along with their trunc
-  // operands, so remove them.
-  for (auto I : Sinks) {
-    if (auto *ZExt = dyn_cast<ZExtInst>(I)) {
-      if (ZExt->getDestTy() != ExtTy)
-        continue;
-
-      Value *Src = ZExt->getOperand(0);
-      if (ZExt->getSrcTy() == ZExt->getDestTy()) {
-        LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n");
-        ReplaceAllUsersOfWith(ZExt, Src);
-        InstsToRemove.push_back(ZExt);
-        continue;
-      }
+void IRPromoter::Cleanup(SmallPtrSetImpl<Value*> &Visited) {
+  // Some zexts will now have become redundant, along with their trunc
+  // operands, so remove them
+  for (auto V : Visited) {
+    if (!isa<ZExtInst>(V))
+      continue;
+
+    auto ZExt = cast<ZExtInst>(V);
+    if (ZExt->getDestTy() != ExtTy)
+      continue;
+
+    Value *Src = ZExt->getOperand(0);
+    if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+      LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast.\n");
+      ReplaceAllUsersOfWith(ZExt, Src);
+      InstsToRemove.push_back(ZExt);
+      continue;
+    }
 
-      // For any truncs that we insert to handle zexts, we can replace the
-      // result of the zext with the input to the trunc.
-      if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
-        auto *Trunc = cast<TruncInst>(Src);
-        assert(Trunc->getOperand(0)->getType() == ExtTy &&
-               "expected inserted trunc to be operating on i32");
-        LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
-                   << *Trunc->getOperand(0));
-        ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
-        InstsToRemove.push_back(ZExt);
-      }
+    // For any truncs that we insert to handle zexts, we can replace the
+    // result of the zext with the input to the trunc.
+    if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
+      auto *Trunc = cast<TruncInst>(Src);
+      assert(Trunc->getOperand(0)->getType() == ExtTy &&
+             "expected inserted trunc to be operating on i32");
+      LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
+                 << *Trunc->getOperand(0));
+      ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
+      InstsToRemove.push_back(ZExt);
     }
   }
 
@@ -728,18 +737,9 @@
 
   // Finally, remove unecessary zexts and truncs, delete old instructions and
   // clear the data structures.
-  Cleanup(Sinks);
+  Cleanup(Visited);
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
-  LLVM_DEBUG(dbgs();
-             for (auto *V : Sources)
-               V->dump();
-             for (auto *I : NewInsts)
-               I->dump();
-             for (auto *V : Visited) {
-               if (!Sources.count(V))
-                 V->dump();
-              });
+  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
 }
 
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
@@ -747,8 +747,15 @@
 /// return value is zeroext. We don't allow opcodes that can introduce sign
 /// bits.
 bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
-  if (isa<ICmpInst>(V))
-    return true;
+  if (auto *I = dyn_cast<ICmpInst>(V)) {
+    // Now that we allow small types than TypeSize, only allow icmp of
+    // TypeSize because they will require a trunc to be legalised.
+    // TODO: Allow icmp of smaller types, and calculate at the end
+    // whether the transform would be beneficial.
+    if (isa<PointerType>(I->getOperand(0)->getType()))
+      return true;
+    return EqualTypeSize(I->getOperand(0));
+  }
 
   // Memory instructions
   if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
@@ -766,12 +773,11 @@
       isa<LoadInst>(V))
     return isSupportedType(V);
 
-  // Truncs can be either sources or sinks.
-  if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc) || isSupportedType(Trunc->getOperand(0));
+  if (isa<SExtInst>(V))
+    return false;
 
-  if (isa<CastInst>(V) && !isa<SExtInst>(V))
-    return isSupportedType(cast<CastInst>(V)->getOperand(0));
+  if (auto *Cast = dyn_cast<CastInst>(V))
+    return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
 
   // Special cases for calls as we need to check for zeroext
   // TODO We should accept calls even if they don't have zeroext, as they can
@@ -901,13 +907,17 @@
     // Calls can be both sources and sinks.
     if (isSink(V))
       Sinks.insert(cast<Instruction>(V));
+
     if (isSource(V))
       Sources.insert(V);
-    else if (auto *I = dyn_cast<Instruction>(V)) {
-      // Visit operands of any instruction visited.
-      for (auto &U : I->operands()) {
-        if (!AddLegalInst(U))
-          return false;
+
+    if (!isSink(V) && !isSource(V)) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        // Visit operands of any instruction visited.
+        for (auto &U : I->operands()) {
+          if (!AddLegalInst(U))
+            return false;
+        }
       }
     }
 
@@ -973,6 +983,8 @@
         if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
           continue;
 
+        LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+
         for (auto &Op : CI.operands()) {
           if (auto *I = dyn_cast<Instruction>(Op))
             MadeChange |= TryToPromote(I);
Index: llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
+++ llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -22,12 +22,10 @@
   ret i16 %conv
 }
 
-; Test that the transformation bails when it finds that i16 is larger than i8.
-; TODO: We should be able to remove the uxtb in these cases.
 ; CHECK-LABEL: promote_i8_sink_i16_1
 ; CHECK: bl dummy_i8
 ; CHECK: add{{.*}} r0, #1
-; CHECK: uxtb r0, r0
+; CHECK-NOT: uxt
 ; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
Index: llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
+++ llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
@@ -340,8 +340,8 @@
 @sh1 = hidden local_unnamed_addr global i16 0, align 2
 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
 
-; CHECK-LABEL: two_stage_zext_trunc_mix
-; CHECK-NOT: uxt
+; CHECK-COMMON-LABEL: two_stage_zext_trunc_mix
+; CHECK-COMMON-NOT: uxt
 define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) {
 entry:
   %__size_.i.i.i.i = bitcast i32** %__str to i8*
@@ -363,3 +363,222 @@
   %res = select i1 %tobool.i.i.i.i.i,  i8* %7, i8* %8
   ret i8* %res
 }
+
+; CHECK-COMMON-LABEL: search_through_zext_1
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_1(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp2 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here. The transform fails because
+; the icmp ugt uses an i32, which is too large... but this doesn't matter
+; because it won't be writing a large value to a register as a result.
+; CHECK-COMMON-LABEL: search_through_zext_2
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_2(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here as all the calculations are
+; performed on i8s. The promotion of i8 to i16 and then the later truncation
+; results in the uxtb.
+; CHECK-COMMON-LABEL: search_through_zext_3
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_3(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %conv to i8
+  %sub = sub nuw i8 %b, %trunc
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxt that gets introduced for %conv2
+; CHECK-COMMON-LABEL: search_through_zext_cmp
+; CHECK-COMMON: uxt
+define i8 @search_through_zext_cmp(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %cmp = icmp ne i8 %a, %b
+  %conv = zext i1 %cmp to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: search_through_zext_load
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %load = load i8, i8* %a
+  %conv = zext i8 %load to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %load
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %load, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than
+; CHECK-COMMON-NOT: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_cmp(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, 1
+  %cmp2 = icmp ugt i8 %trunc, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; TODO: We should be able to remove the uxth introduced to handle %sub
+; CHECK-COMMON-LABEL: trunc_sink_less_than_arith
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_arith(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  %cmp2 = icmp ugt i8 %e, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_store
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON-NOT: uxt
+define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8* %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  store i8 %add, i8* %e
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %sub, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_ret
+; CHECK-COMMON-NOT: uxt
+define i8 @trunc_sink_less_than_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_zext_ret
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define zeroext i8 @trunc_sink_less_than_zext_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
Index: llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
+++ llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
@@ -11,18 +11,17 @@
 ; CHECK-NODSP: sxtb
 ; CHECK-NODSP: cmp
 
-; CHECK-DSP: add
-; CHECK-DSP: uxtb
+; CHECK-DSP: uadd8
+; CHECK-DSP: sub
 ; CHECK-DSP: cmp
 ; CHECK-DSP: sxtb
-; CHECK-DSP: sub
 ; CHECK-DSP: sxtb
 ; CHECK-DSP: cmp
 
 ; CHECK-DSP-IMM: uadd8 [[ADD:r[0-9]+]],
 ; CHECK-DSP-IMM: cmp [[ADD]],
+; CHECK-DSP-IMM: subs [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT0:r[0-9]+]], [[ADD]]
-; CHECK-DSP-IMM: usub8 [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT1:r[0-9]+]], [[SUB]]
 ; CHECK-DSP-IMM: cmp [[SEXT1]], [[SEXT0]]
 define i8 @eq_sgt(i8* %x, i8 *%y, i8 zeroext %z) {