Index: lib/Target/ARM/ARMCodeGenPrepare.cpp
===================================================================
--- lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -85,16 +85,15 @@
   const ARMSubtarget *ST = nullptr;
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
-  Type *OrigTy = nullptr;
-  unsigned TypeSize = 0;
 
-  bool isNarrowInstSupported(Instruction *I);
   bool isSupportedValue(Value *V);
   bool isLegalToPromote(Value *V);
   bool TryToPromote(Value *V);
 
 public:
   static char ID;
+  static unsigned TypeSize;
+  Type *OrigTy = nullptr;
 
   ARMCodeGenPrepare() : FunctionPass(ID) {}
 
@@ -126,65 +125,66 @@
 /// dealing with icmps but allow any other integer that is <= 16 bits. Void
 /// types are accepted so we can handle switches.
 static bool isSupportedType(Value *V) {
-  if (V->getType()->isVoidTy())
+  LLVM_DEBUG(dbgs() << "ARM CGP: isSupportedType: " << *V << "\n");
+  Type *Ty = V->getType();
+  if (Ty->isVoidTy())
     return true;
 
-  const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
-  if (!IntTy)
-    return false;
+  if (auto *Ld = dyn_cast<LoadInst>(V))
+    Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
 
-  // Don't try to promote boolean values.
-  if (IntTy->getBitWidth() == 1)
+  const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
+  if (!IntTy) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: No, not an integer.\n");
     return false;
+  }
 
-  if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    return isSupportedType(ZExt->getOperand(0));
+  return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
+}
 
-  return IntTy->getBitWidth() <= 16;
+/// Identify the leaves of a use-def chain: values that produce a narrow
+/// (i8, i16) result and will be zext'd to seed the promotion of the tree to
+/// i32. Arguments and loads always qualify; a call qualifies only when its
+/// return value carries the zeroext attribute. We guarantee that these won't
+/// populate the upper bits of the register. The zext is free for loads and
+/// for zeroext call results, and for the many arguments that are themselves
+/// marked zeroext.
+static bool isSource(Value *V) {
+  // TODO Allow truncs and zext to be sources.
+  if (isa<Argument>(V) || isa<LoadInst>(V))
+    return true;
+
+  // Any other value is a source only if it is a zeroext-returning call.
+  if (auto *Call = dyn_cast<CallInst>(V))
+    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+  return false;
 }
 
 /// Return true if V will require any promoted values to be truncated for the
-/// use to be valid.
+/// IR to remain valid. We can't mutate the value type of these
+/// instructions.
 static bool isSink(Value *V) {
+  // TODO The truncate also isn't actually necessary because we would have
+  // already proved that the data value is kept within the range of the
+  // original data type.
   auto UsesNarrowValue = [](Value *V) {
-    return V->getType()->getScalarSizeInBits() <= 32;
+    return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
   };
 
   if (auto *Store = dyn_cast<StoreInst>(V))
     return UsesNarrowValue(Store->getValueOperand());
   if (auto *Return = dyn_cast<ReturnInst>(V))
     return UsesNarrowValue(Return->getReturnValue());
+  if (auto *Trunc = dyn_cast<TruncInst>(V))
+    return UsesNarrowValue(Trunc->getOperand(0));
 
   return isa<CallInst>(V);
 }
 
-/// Return true if the given value is a leaf that will need to be zext'd.
-static bool isSource(Value *V) {
-  if (isa<Argument>(V) && isSupportedType(V))
-    return true;
-  else if (isa<TruncInst>(V))
-    return true;
-  else if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    // ZExt can be a leaf if its the only user of a load.
-    return isa<LoadInst>(ZExt->getOperand(0)) &&
-                         ZExt->getOperand(0)->hasOneUse();
-  else if (auto *Call = dyn_cast<CallInst>(V))
-    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
-  else if (auto *Load = dyn_cast<LoadInst>(V)) {
-    if (!isa<IntegerType>(Load->getType()))
-      return false;
-    // A load is a leaf, unless its already just being zext'd.
-    if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
-      return false;
-
-    return true;
-  }
-  return false;
-}
-
 /// Return whether the instruction can be promoted within any modifications to
 /// it's operands or result.
 static bool isSafeOverflow(Instruction *I) {
+  // FIXME Do we need NSW too?
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
 
@@ -222,19 +222,18 @@
 }
 
 static bool shouldPromote(Value *V) {
-  auto *I = dyn_cast<Instruction>(V);
-  if (!I)
+  if (!isa<IntegerType>(V->getType()) || isSink(V))
     return false;
 
-  if (!isa<IntegerType>(V->getType()))
-    return false;
+  if (isSource(V))
+    return true;
 
-  if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
-      isa<ICmpInst>(I))
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
     return false;
 
-  if (auto *ZExt = dyn_cast<ZExtInst>(I))
-    return !ZExt->getDestTy()->isIntegerTy(32);
+  if (isa<ICmpInst>(I))
+    return false;
 
   return true;
 }
@@ -262,7 +261,7 @@
 /// Return the intrinsic for the instruction that can perform the same
 /// operation but on a narrow type. This is using the parallel dsp intrinsics
 /// on scalar values.
-static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
   // Whether we use the signed or unsigned versions of these intrinsics
   // doesn't matter because we're not using the GE bits that they set in
   // the APSR.
@@ -270,10 +269,10 @@
   default:
     break;
   case Instruction::Add:
-    return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
       Intrinsic::arm_uadd8;
   case Instruction::Sub:
-    return TypeSize == 16 ? Intrinsic::arm_usub16 :
+    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
       Intrinsic::arm_usub8;
   }
   llvm_unreachable("unhandled opcode for narrow intrinsic");
@@ -285,10 +284,9 @@
                         SmallPtrSetImpl<Instruction*> &Roots) {
   IRBuilder<> Builder{Ctx};
   Type *ExtTy = Type::getInt32Ty(M->getContext());
-  unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
   SmallPtrSet<Value*, 8> Promoted;
-  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
-        << " to 32-bits\n");
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
+             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
 
   auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
     SmallVector<Instruction*, 4> Users;
@@ -325,7 +323,7 @@
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
                << *I << "\n");
     Function *DSPInst =
-      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
     Builder.SetInsertPoint(I);
     Builder.SetCurrentDebugLocation(I->getDebugLoc());
     Value *Args[] = { I->getOperand(0), I->getOperand(1) };
@@ -353,9 +351,7 @@
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
   for (auto V : Leaves) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
-    if (auto *ZExt = dyn_cast<ZExtInst>(V))
-      ZExt->mutateType(ExtTy);
-    else if (auto *I = dyn_cast<Instruction>(V))
+    if (auto *I = dyn_cast<Instruction>(V))
       InsertZExt(I, I);
     else if (auto *Arg = dyn_cast<Argument>(V)) {
       BasicBlock &BB = Arg->getParent()->front();
@@ -401,17 +397,9 @@
   for (auto *V : Visited) {
     if (Leaves.count(V))
       continue;
-    if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
-      if (ZExt->getDestTy() != ExtTy) {
-        ZExt->mutateType(ExtTy);
-        Promoted.insert(ZExt);
-      }
-      else if (ZExt->getSrcTy() == ExtTy) {
-        ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
-        InstsToRemove.push_back(ZExt);
-      }
+
+    if (!isa<Instruction>(V))
       continue;
-    }
 
     if (!shouldPromote(V) || isPromotedResultSafe(V))
       continue;
@@ -459,30 +447,6 @@
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
 }
 
-bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
-  if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
-    return false;
-
-  if (ST->isThumb() && !ST->hasThumb2())
-    return false;
-
-  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
-    return false;
-
-  // TODO
-  // Would it be profitable? For Thumb code, these parallel DSP instructions
-  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
-  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
-  // halved. They also do not take immediates as operands.
-  for (auto &Op : I->operands()) {
-    if (isa<Constant>(Op)) {
-      if (!EnableDSPWithImms)
-        return false;
-    }
-  }
-  return true;
-}
-
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
 /// Disallow casts other than zext and truncs and only allow calls if their
 /// return value is zeroext. We don't allow opcodes that can introduce sign
@@ -490,42 +454,42 @@
 bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
 
-  // Non-instruction values that we can handle.
-  if (isa<ConstantInt>(V) || isa<Argument>(V))
-    return true;
+  if (auto *ICmp = dyn_cast<ICmpInst>(V))
+    return ICmp->isEquality() || !ICmp->isSigned();
 
   // Memory instructions
-  if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+  if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
     return true;
 
   // Branches and targets.
-  if (auto *ICmp = dyn_cast<ICmpInst>(V))
-    return ICmp->isEquality() || !ICmp->isSigned();
-
   if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
     return true;
 
-  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
-    return true;
+  // Non-instruction values that we can handle.
+  if (isa<ConstantInt>(V) || isa<Argument>(V))
+    return isSupportedType(V);
+
+  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
+      isa<LoadInst>(V))
+    return isSupportedType(V);
+
+  // Currently, Trunc is the only cast we support.
+  if (auto *Trunc = dyn_cast<TruncInst>(V))
+    return isSupportedType(Trunc->getOperand(0));
 
   // Special cases for calls as we need to check for zeroext
   // TODO We should accept calls even if they don't have zeroext, as they can
   // still be roots.
   if (auto *Call = dyn_cast<CallInst>(V))
-    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
-  else if (auto *Cast = dyn_cast<CastInst>(V)) {
-    if (isa<ZExtInst>(Cast))
-      return Cast->getDestTy()->getScalarSizeInBits() <= 32;
-    else if (auto *Trunc = dyn_cast<TruncInst>(V))
-      return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
-    else {
-      LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
-      return false;
-    }
-  } else if (!isa<BinaryOperator>(V)) {
+    return isSupportedType(Call) &&
+           Call->hasRetAttr(Attribute::AttrKind::ZExt);
+
+  if (!isa<BinaryOperator>(V)) {
     LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
     return false;
   }
+  if (!isSupportedType(V))
+    return false;
 
   bool res = !isSigned(V);
   if (!res)
@@ -537,39 +501,49 @@
 /// smaller than the targeted promoted type. Check that we're not trying to
 /// promote something larger than our base 'TypeSize' type.
 bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-  if (!isSupportedType(V))
-    return false;
+  if (isPromotedResultSafe(V))
+    return true;
 
-  unsigned VSize = 0;
-  if (auto *Ld = dyn_cast<LoadInst>(V)) {
-    auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
-    VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
-  } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
-    VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
-  } else {
-    VSize = V->getType()->getPrimitiveSizeInBits();
-  }
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
 
-  if (VSize > TypeSize)
+  // If promotion is not safe, can we use a DSP instruction to natively
+  // handle the narrow type?
+  if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
     return false;
 
-  if (isPromotedResultSafe(V))
-    return true;
+  if (ST->isThumb() && !ST->hasThumb2())
+    return false;
 
-  if (auto *I = dyn_cast<Instruction>(V))
-    return isNarrowInstSupported(I);
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+    return false;
 
-  return false;
+  // TODO
+  // Would it be profitable? For Thumb code, these parallel DSP instructions
+  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+  // halved. They also do not take immediates as operands.
+  for (auto &Op : I->operands()) {
+    if (isa<Constant>(Op)) {
+      if (!EnableDSPWithImms)
+        return false;
+    }
+  }
+  return true;
 }
 
 bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   OrigTy = V->getType();
   TypeSize = OrigTy->getPrimitiveSizeInBits();
+  if (TypeSize > 16)
+    return false;
 
   if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
     return false;
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
+             << TypeSize << "\n");
 
   SetVector<Value*> WorkList;
   SmallPtrSet<Value*, 8> Leaves;
@@ -584,6 +558,10 @@
     if (CurrentVisited.count(V))
       return true;
 
+    // Ignore pointer values that aren't instructions.
+    if (!isa<Instruction>(V) && isa<PointerType>(V->getType()))
+      return true;
+
     if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
       LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
       return false;
@@ -638,41 +616,10 @@
     }
   }
 
-  unsigned NumToPromote = 0;
-  unsigned Cost = 0;
-  for (auto *V : CurrentVisited) {
-    // Truncs will cause a uxt and no zeroext arguments will often require
-    // a uxt somewhere.
-    if (isa<TruncInst>(V))
-      ++Cost;
-    else if (auto *Arg = dyn_cast<Argument>(V)) {
-      if (!Arg->hasZExtAttr())
-        ++Cost;
-    }
-
-    // Mem ops can automatically be extended/truncated and non-instructions
-    // don't need anything done.
-    if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
-      continue;
-
-    // Will need to truncate calls args and returns.
-    if (Roots.count(cast<Instruction>(V))) {
-      ++Cost;
-      continue;
-    }
-
-    if (shouldPromote(V))
-      ++NumToPromote;
-  }
-
   LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
              for (auto *I : CurrentVisited)
                I->dump();
              );
-  LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
-             << " instructions = " << Cost << "\n");
-  if (Cost > NumToPromote || (NumToPromote == 0))
-    return false;
 
   Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
   return true;
@@ -712,12 +659,8 @@
 
         LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
         for (auto &Op : CI.operands()) {
-          if (auto *I = dyn_cast<Instruction>(Op)) {
-            if (isa<ZExtInst>(I))
-              MadeChange |= TryToPromote(I->getOperand(0));
-            else
-              MadeChange |= TryToPromote(I);
-          }
+          if (auto *I = dyn_cast<Instruction>(Op))
+            MadeChange |= TryToPromote(I);
         }
       }
     }
@@ -744,6 +687,7 @@
                     false, false)
 
 char ARMCodeGenPrepare::ID = 0;
+unsigned ARMCodeGenPrepare::TypeSize = 0;
 
 FunctionPass *llvm::createARMCodeGenPreparePass() {
   return new ARMCodeGenPrepare();
Index: test/CodeGen/ARM/arm-cgp-icmps.ll
===================================================================
--- test/CodeGen/ARM/arm-cgp-icmps.ll
+++ test/CodeGen/ARM/arm-cgp-icmps.ll
@@ -158,39 +158,6 @@
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: dsp_imm2
-; CHECK-COMMON:   add   r0, r1
-; CHECK-DSP-NEXT: ldrh  r1, [r3]
-; CHECK-DSP-NEXT: ldrh  r2, [r2]
-; CHECK-DSP-NEXT: subs  r1, r1, r0
-; CHECK-DSP-NEXT: add   r0, r2
-; CHECK-DSP-NEXT: uxth  r3, r1
-; CHECK-DSP-NEXT: uxth  r2, r0
-; CHECK-DSP-NEXT: cmp   r2, r3
-
-; CHECK-DSP-IMM:      movs  r1, #0
-; CHECK-DSP-IMM-NEXT: uxth  r0, r0
-; CHECK-DSP-IMM-NEXT: usub16  r1, r1, r0
-; CHECK-DSP-IMM-NEXT: ldrh  r0, [r2]
-; CHECK-DSP-IMM-NEXT: ldrh  r3, [r3]
-; CHECK-DSP-IMM-NEXT: usub16  r0, r0, r1
-; CHECK-DSP-IMM-NEXT: uadd16  r1, r3, r1
-; CHECK-DSP-IMM-NEXT: cmp r0, r1
-
-define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
-entry:
-  %add0 = add i32 %arg0, %arg1
-  %conv0 = trunc i32 %add0 to i16
-  %sub0 = sub i16 0, %conv0
-  %load0 = load i16, i16* %gep0, align 2
-  %load1 = load i16, i16* %gep1, align 2
-  %sub1 = sub i16 %load0, %sub0
-  %add1 = add i16 %load1, %sub0
-  %cmp = icmp ult i16 %sub1, %add1
-  %res = select i1 %cmp, i16 %add1, i16 %sub1
-  ret i16 %res
-}
-
 ; CHECK-COMMON-LABEL: dsp_var:
 ; CHECK-COMMON:   eors    r1, r0
 ; CHECK-COMMON:   and     r2, r0, #7
@@ -267,109 +234,6 @@
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: icmp_i32_zext:
-; CHECK-COMMON:     ldrb [[LD:r[^ ]+]], [r0]
-; CHECK-COMMON:     subs [[SUB:r[^ ]+]], [[LD]], #1
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON:     cmp [[LD]], [[SUB]]
-; CHECK-COMMON-NOT: uxt
-define i8 @icmp_i32_zext(i8* %ptr) {
-entry:
-  %gep = getelementptr inbounds i8, i8* %ptr, i32 0
-  %0 = load i8, i8* %gep, align 1
-  %1 = sub nuw nsw i8 %0, 1
-  %conv44 = zext i8 %0 to i32
-  br label %preheader
-
-preheader:
-  br label %body
-
-body:
-  %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
-  %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
-  %conv51266 = zext i8 %2 to i32
-  %cmp52267 = icmp eq i32 %si.0274, %conv51266
-  br i1 %cmp52267, label %if.end, label %exit
-
-if.end:
-  %inc = add i32 %si.0274, 1
-  %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
-  %3 = load i8, i8* %gep1, align 1
-  br label %body
-
-exit:
-  ret i8 %2
-}
-
-@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
-@sh1 = hidden local_unnamed_addr global i16 0, align 2
-@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
-
-; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16
-; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]],
-; CHECK-NODSP: strh [[BYTE]],
-; CHECK-NODSP: ldrsh.w
-define i32 @icmp_sext_zext_store_i8_i16() {
-entry:
-  %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1
-  %conv = zext i8 %0 to i16
-  store i16 %conv, i16* @sh1, align 2
-  %conv1 = zext i8 %0 to i32
-  %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2
-  %conv2 = sext i16 %1 to i32
-  %cmp = icmp eq i32 %conv1, %conv2
-  %conv3 = zext i1 %cmp to i32
-  ret i32 %conv3
-}
-
-; CHECK-COMMON-LABEL: or_icmp_ugt:
-; CHECK-COMMON:     ldrb [[LD:r[^ ]+]], [r1]
-; CHECK-COMMON:     subs [[SUB:r[^ ]+]], #1
-; CHECK-COMMON-NOT: uxtb
-; CHECK-COMMON:     cmp [[SUB]], #3
-define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
-entry:
-  %0 = load i8, i8* %ptr
-  %1 = zext i8 %0 to i32
-  %mul = shl nuw nsw i32 %1, 1
-  %add0 = add nuw nsw i32 %mul, 6
-  %cmp0 = icmp ne i32 %arg, %add0
-  %add1 = add i8 %0, -1
-  %cmp1 = icmp ugt i8 %add1, 3
-  %or = or i1 %cmp0, %cmp1
-  ret i1 %or
-}
-
-; CHECK-COMMON-LABEL: icmp_switch_trunc:
-; CHECK-COMMON-NOT: uxt
-define i16 @icmp_switch_trunc(i16 zeroext %arg) {
-entry:
-  %conv = add nuw i16 %arg, 15
-  %mul = mul nuw nsw i16 %conv, 3
-  %trunc = trunc i16 %arg to i3
-  switch i3 %trunc, label %default [
-    i3 0, label %sw.bb
-    i3 1, label %sw.bb.i
-  ]
-
-sw.bb:
-  %cmp0 = icmp ult i16 %mul, 127
-  %select = select i1 %cmp0, i16 %mul, i16 127
-  br label %exit
-
-sw.bb.i:
-  %cmp1 = icmp ugt i16 %mul, 34
-  %select.i = select i1 %cmp1, i16 %mul, i16 34
-  br label %exit
-
-default:
-  br label %exit
-
-exit:
-  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
-  ret i16 %res
-}
-
 ; CHECK-COMMON-LABEL: icmp_eq_minus_one
 ; CHECK-COMMON: cmp r0, #255
 define i32 @icmp_eq_minus_one(i8* %ptr) {
@@ -392,77 +256,3 @@
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: mul_wrap
-; CHECK-COMMON: mul
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @mul_wrap(i16 %arg0, i16 %arg1) {
-  %mul = mul i16 %arg0, %arg1
-  %cmp = icmp eq i16 %mul, 1
-  %res = select i1 %cmp, i16 %arg0, i16 47
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: shl_wrap
-; CHECK-COMMON: lsl
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @shl_wrap(i16 %arg0) {
-  %mul = shl i16 %arg0, 4
-  %cmp = icmp eq i16 %mul, 1
-  %res = select i1 %cmp, i16 %arg0, i16 47
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: add_wrap
-; CHECK-COMMON: add
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @add_wrap(i16 %arg0, i16 %arg1) {
-  %add = add i16 %arg0, 128
-  %cmp = icmp eq i16 %add, %arg1
-  %res = select i1 %cmp, i16 %arg0, i16 1
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: sub_wrap
-; CHECK-COMMON: sub
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) {
-  %sub = sub i16 %arg0, %arg2
-  %cmp = icmp eq i16 %sub, %arg1
-  %res = select i1 %cmp, i16 %arg0, i16 1
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: urem_trunc_icmps
-; CHECK-COMMON-NOT: uxt
-define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
-entry:
-  %ptr = load i16*, i16** %in, align 4
-  %ld = load i16, i16* %ptr, align 2
-  %cmp.i = icmp eq i16 %ld, 0
-  br i1 %cmp.i, label %exit, label %cond.false.i
-
-cond.false.i:
-  %rem = urem i16 5, %ld
-  %extract.t = trunc i16 %rem to i8
-  br label %body
-
-body:
-  %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
-  %cmp = icmp ugt i8 %cond.in.i.off0, 7
-  %conv5 = zext i1 %cmp to i32
-  store i32 %conv5, i32* %g, align 4
-  %.pr = load i32, i32* %k, align 4
-  %tobool13150 = icmp eq i32 %.pr, 0
-  br i1 %tobool13150, label %for.inc, label %exit
-
-for.inc:
-  %add = add nuw i8 %cond.in.i.off0, 1
-  br label %body
-
-exit:
-  ret void
-}
Index: test/CodeGen/ARM/arm-cgp-overflow.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-cgp-overflow.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s
+
+; CHECK-LABEL: overflow_add:
+; CHECK: add
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
+  %add = add i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_sub
+; CHECK: sub
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
+  %add = sub i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_mul
+; CHECK: mul
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
+  %add = mul i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_shl
+; CHECK: lsl
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
+  %add = shl i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
Index: test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
===================================================================
--- test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
+++ test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
@@ -116,48 +116,6 @@
   ret void
 }
 
-; CHECK-COMMON-LABEL: phi_feeding_switch
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: uxtb
-; CHECK-COMMON-NOT: uxt
-define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
-entry:
-  %pre = load i8, i8* %memblock, align 1
-  %conv = trunc i16 %arg to i8
-  br label %header
-
-header:
-  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
-  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
-  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
-  switch i8 %phi.0, label %default [
-    i8 43, label %for.inc.i
-    i8 45, label %for.inc.i.i
-  ]
-
-for.inc.i:
-  %xor = xor i8 %phi.1, 1
-  br label %latch
-
-for.inc.i.i:
-  %and = and i8 %phi.1, 3
-  br label %latch
-
-default:
-  %sub = sub i8 %phi.0, 1
-  %cmp2 = icmp ugt i8 %sub, 4
-  br i1 %cmp2, label %latch, label %exit
-
-latch:
-  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
-  %count = add nuw i8 %phi.2, 1
-  store i8 %count, i8* %store, align 1
-  br label %header
-
-exit:
-  ret void
-}
-
 ; CHECK-COMMON-LABEL: ret_i8
 ; CHECK-COMMON-NOT:   uxt
 define i8 @ret_i8() {
@@ -186,33 +144,6 @@
   ret i8 %inc2
 }
 
-; Check that %exp requires uxth in all cases, and will also be required to
-; promote %1 for the call - unless we can generate a uadd16.
-; CHECK-COMMON-LABEL: zext_load_sink_call:
-; CHECK-COMMON:       uxt
-; CHECK-DSP-IMM:      uadd16
-; CHECK-COMMON:       cmp
-; CHECK-DSP:          uxt
-; CHECK-DSP-IMM-NOT:  uxt
-define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
-entry:
-  %0 = load i16, i16* %ptr, align 4
-  %1 = add i16 %exp, 3
-  %cmp = icmp eq i16 %0, %exp
-  br i1 %cmp, label %exit, label %if.then
-
-if.then:
-  %conv0 = zext i16 %0 to i32
-  %conv1 = zext i16 %1 to i32
-  %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
-  br label %exit
-
-exit:
-  %exitval = phi i32 [ %call, %if.then ], [ 0, %entry  ]
-  ret i32 %exitval
-}
-
-
 ; Check that the pass doesn't try to promote the immediate parameters.
 ; CHECK-COMMON-LABEL: call_with_imms
 ; CHECK-COMMON-NOT:   uxt
@@ -301,9 +232,10 @@
   ret i32 undef
 }
 
+; Transform will bail because of the zext
 ; Check that d.sroa.0.0.be is promoted passed directly into the tail call.
 ; CHECK-COMMON-LABEL: check_zext_phi_call_arg
-; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: uxt
 define i32 @check_zext_phi_call_arg() {
 entry:
   br label %for.cond
@@ -385,7 +317,6 @@
 declare dso_local i32 @e(...) local_unnamed_addr #1
 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
 
-declare i32 @dummy(i32, i32)
 declare i8 @dummy_i8(i8)
 declare i8 @dummy2(i8*, i8, i8)
 declare i16 @dummy3(i16)
Index: test/CodeGen/ARM/arm-cgp-zext-truncs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-cgp-zext-truncs.ll
@@ -0,0 +1,292 @@
+; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; Transform will fail because the trunc is not a sink.
+; CHECK-COMMON-LABEL: dsp_trunc
+; CHECK-COMMON:   add   [[ADD:[^ ]+]],
+; CHECK-DSP-NEXT: ldrh  r1, [r3]
+; CHECK-DSP-NEXT: ldrh  r2, [r2]
+; CHECK-DSP-NEXT: subs  r1, r1, [[ADD]]
+; CHECK-DSP-NEXT: add   r0, r2
+; CHECK-DSP-NEXT: uxth  r3, r1
+; CHECK-DSP-NEXT: uxth  r2, r0
+; CHECK-DSP-NEXT: cmp   r2, r3
+
+; With DSP-IMM, we could have:
+; movs  r1, #0
+; uxth  r0, r0
+; usub16  r1, r1, r0
+; ldrh  r0, [r2]
+; ldrh  r3, [r3]
+; usub16  r0, r0, r1
+; uadd16  r1, r3, r1
+; cmp r0, r1
+define i16 @dsp_trunc(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
+entry:
+  %add0 = add i32 %arg0, %arg1
+  %conv0 = trunc i32 %add0 to i16
+  %sub0 = sub i16 0, %conv0
+  %load0 = load i16, i16* %gep0, align 2
+  %load1 = load i16, i16* %gep1, align 2
+  %sub1 = sub i16 %load0, %sub0
+  %add1 = add i16 %load1, %sub0
+  %cmp = icmp ult i16 %sub1, %add1
+  %res = select i1 %cmp, i16 %add1, i16 %sub1
+  ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: trunc_i16_i8
+; CHECK-COMMON: ldrh
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: cmp
+define i8 @trunc_i16_i8(i16* %ptr, i16 zeroext %arg0, i8 zeroext %arg1) {
+entry:
+  %0 = load i16, i16* %ptr
+  %1 = add i16 %0, %arg0
+  %2 = trunc i16 %1 to i8
+  %3 = icmp ugt i8 %2, %arg1
+  %4 = select i1 %3, i8 %2, i8 %arg1
+  ret i8 %4
+}
+
+; The pass will bail because of the zext, otherwise we'd want something like:
+; ldrb [[LD:r[^ ]+]], [r0]
+; subs [[SUB:r[^ ]+]], [[LD]], #1
+; cmp [[LD]], [[SUB]]
+; CHECK-COMMON-LABEL: icmp_i32_zext:
+; CHECK-COMMON: uxtb
+define i8 @icmp_i32_zext(i8* %ptr) {
+entry:
+  %gep = getelementptr inbounds i8, i8* %ptr, i32 0
+  %0 = load i8, i8* %gep, align 1
+  %1 = sub nuw nsw i8 %0, 1
+  %conv44 = zext i8 %0 to i32
+  br label %preheader
+
+preheader:
+  br label %body
+
+body:
+  %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
+  %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
+  %conv51266 = zext i8 %2 to i32
+  %cmp52267 = icmp eq i32 %si.0274, %conv51266
+  br i1 %cmp52267, label %if.end, label %exit
+
+if.end:
+  %inc = add i32 %si.0274, 1
+  %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
+  %3 = load i8, i8* %gep1, align 1
+  br label %body
+
+exit:
+  ret i8 %2
+}
+
+; Won't handle zext or sext
+; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16
+define i32 @icmp_sext_zext_store_i8_i16() {
+entry:
+  %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1
+  %conv = zext i8 %0 to i16
+  store i16 %conv, i16* @sh1, align 2
+  %conv1 = zext i8 %0 to i32
+  %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2
+  %conv2 = sext i16 %1 to i32
+  %cmp = icmp eq i32 %conv1, %conv2
+  %conv3 = zext i1 %cmp to i32
+  ret i32 %conv3
+}
+
+; Pass will bail because of the zext, otherwise:
+; ldrb [[LD:r[^ ]+]], [r1]
+; subs [[SUB:r[^ ]+]], #1
+; cmp [[SUB]], #3
+; CHECK-COMMON-LABEL: or_icmp_ugt:
+; CHECK-COMMON: uxt
+define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
+entry:
+  %0 = load i8, i8* %ptr
+  %1 = zext i8 %0 to i32
+  %mul = shl nuw nsw i32 %1, 1
+  %add0 = add nuw nsw i32 %mul, 6
+  %cmp0 = icmp ne i32 %arg, %add0
+  %add1 = add i8 %0, -1
+  %cmp1 = icmp ugt i8 %add1, 3
+  %or = or i1 %cmp0, %cmp1
+  ret i1 %or
+}
+
+; CHECK-COMMON-LABEL: icmp_switch_trunc:
+; CHECK-COMMON-NOT: uxt
+define i16 @icmp_switch_trunc(i16 zeroext %arg) {
+entry:
+  %conv = add nuw i16 %arg, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  %trunc = trunc i16 %arg to i3
+  switch i3 %trunc, label %default [
+    i3 0, label %sw.bb
+    i3 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  br label %exit
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+  ret i16 %res
+}
+
+; Pass will bail because of the zext
+; CHECK-COMMON-LABEL: urem_trunc_icmps
+; CHECK-COMMON: uxt
+define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
+entry:
+  %ptr = load i16*, i16** %in, align 4
+  %ld = load i16, i16* %ptr, align 2
+  %cmp.i = icmp eq i16 %ld, 0
+  br i1 %cmp.i, label %exit, label %cond.false.i
+
+cond.false.i:
+  %rem = urem i16 5, %ld
+  %extract.t = trunc i16 %rem to i8
+  br label %body
+
+body:
+  %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
+  %cmp = icmp ugt i8 %cond.in.i.off0, 7
+  %conv5 = zext i1 %cmp to i32
+  store i32 %conv5, i32* %g, align 4
+  %.pr = load i32, i32* %k, align 4
+  %tobool13150 = icmp eq i32 %.pr, 0
+  br i1 %tobool13150, label %for.inc, label %exit
+
+for.inc:
+  %add = add nuw i8 %cond.in.i.off0, 1
+  br label %body
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: phi_feeding_switch
+; CHECK-COMMON: ldrb
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
+entry:
+  %pre = load i8, i8* %memblock, align 1
+  %conv = trunc i16 %arg to i8
+  br label %header
+
+header:
+  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
+  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
+  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
+  switch i8 %phi.0, label %default [
+    i8 43, label %for.inc.i
+    i8 45, label %for.inc.i.i
+  ]
+
+for.inc.i:
+  %xor = xor i8 %phi.1, 1
+  br label %latch
+
+for.inc.i.i:
+  %and = and i8 %phi.1, 3
+  br label %latch
+
+default:
+  %sub = sub i8 %phi.0, 1
+  %cmp2 = icmp ugt i8 %sub, 4
+  br i1 %cmp2, label %latch, label %exit
+
+latch:
+  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
+  %count = add nuw i8 %phi.2, 1
+  store i8 %count, i8* %store, align 1
+  br label %header
+
+exit:
+  ret void
+}
+
+; Again, zexts will prevent the transform.
+; Check that %exp requires uxth in all cases, and will also be required to
+; promote %1 for the call - unless we can generate a uadd16.
+; CHECK-COMMON-LABEL: zext_load_sink_call:
+; CHECK-COMMON: uxt
+; uadd16
+; cmp
+; CHECK-COMMON: uxt
+define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
+entry:
+  %0 = load i16, i16* %ptr, align 4
+  %1 = add i16 %exp, 3
+  %cmp = icmp eq i16 %0, %exp
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  %conv0 = zext i16 %0 to i32
+  %conv1 = zext i16 %1 to i32
+  %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
+  br label %exit
+
+exit:
+  %exitval = phi i32 [ %call, %if.then ], [ 0, %entry  ]
+  ret i32 %exitval
+}
+
+%class.ae = type { i8 }
+%class.x = type { i8 }
+%class.v = type { %class.q }
+%class.q = type { i16 }
+
+; CHECK-COMMON-LABEL: trunc_i16_i9_switch
+; CHECK-COMMON-NOT: uxt
+define i32 @trunc_i16_i9_switch(%class.ae* %this) {
+entry:
+  %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this)
+  %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call)
+  %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0
+  %1 = load i16, i16* %0, align 2
+  %2 = trunc i16 %1 to i9
+  %trunc = and i9 %2, -64
+  switch i9 %trunc, label %cleanup.fold.split [
+    i9 0, label %cleanup
+    i9 -256, label %if.then7
+  ]
+
+if.then7:
+  %3 = and i16 %1, 7
+  %tobool = icmp eq i16 %3, 0
+  %cond = select i1 %tobool, i32 2, i32 1
+  br label %cleanup
+
+cleanup.fold.split:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ]
+  ret i32 %retval.0
+}
+
+declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr
+declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr
+declare i32 @dummy(i32, i32)
+
+@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
+@sh1 = hidden local_unnamed_addr global i16 0, align 2
+@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2