Index: llvm/trunk/lib/Target/ARM/ARM.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARM.h +++ llvm/trunk/lib/Target/ARM/ARM.h @@ -43,6 +43,7 @@ FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMCodeGenPreparePass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -64,6 +65,7 @@ void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +void initializeARMCodeGenPreparePass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); Index: llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -0,0 +1,746 @@ +//===----- ARMCodeGenPrepare.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass inserts intrinsics to handle small types that would otherwise be +/// promoted during legalization. Here we can manually promote types or insert +/// intrinsics which can handle narrow types that aren't supported by the +/// register classes. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" + +#define DEBUG_TYPE "arm-codegenprepare" + +using namespace llvm; + +static cl::opt +DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(false), + cl::desc("Disable ARM specific CodeGenPrepare pass")); + +static cl::opt +EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false), + cl::desc("Use DSP instructions for scalar operations")); + +static cl::opt +EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), + cl::desc("Use DSP instructions for scalar operations\ + with immediate operands")); + +namespace { + +class IRPromoter { + SmallPtrSet NewInsts; + SmallVector InstsToRemove; + Module *M = nullptr; + LLVMContext &Ctx; + +public: + IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { } + + void Cleanup() { + for (auto *I : InstsToRemove) { + LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); + I->dropAllReferences(); + I->eraseFromParent(); + } + InstsToRemove.clear(); + NewInsts.clear(); + } + + void Mutate(Type *OrigTy, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl 
&Leaves, + SmallPtrSetImpl &Roots); +}; + +class ARMCodeGenPrepare : public FunctionPass { + const ARMSubtarget *ST = nullptr; + IRPromoter *Promoter = nullptr; + std::set AllVisited; + Type *OrigTy = nullptr; + unsigned TypeSize = 0; + + bool isNarrowInstSupported(Instruction *I); + bool isSupportedValue(Value *V); + bool isLegalToPromote(Value *V); + bool TryToPromote(Value *V); + +public: + static char ID; + + ARMCodeGenPrepare() : FunctionPass(ID) {} + + ~ARMCodeGenPrepare() { delete Promoter; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + StringRef getPassName() const override { return "ARM IR optimizations"; } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; +}; + +} + +/// Can the given value generate sign bits. +static bool isSigned(Value *V) { + if (!isa(V)) + return false; + + unsigned Opc = cast(V)->getOpcode(); + return Opc == Instruction::AShr || Opc == Instruction::SDiv || + Opc == Instruction::SRem; +} + +/// Some instructions can use 8- and 16-bit operands, and we don't need to +/// promote anything larger. We disallow booleans to make life easier when +/// dealing with icmps but allow any other integer that is <= 16 bits. Void +/// types are accepted so we can handle switches. +static bool isSupportedType(Value *V) { + if (V->getType()->isVoidTy()) + return true; + + const IntegerType *IntTy = dyn_cast(V->getType()); + if (!IntTy) + return false; + + // Don't try to promote boolean values. + if (IntTy->getBitWidth() == 1) + return false; + + if (auto *ZExt = dyn_cast(V)) + return isSupportedType(ZExt->getOperand(0)); + + return IntTy->getBitWidth() <= 16; +} + +/// Return true if V will require any promoted values to be truncated for the +/// use to be valid. +static bool isSink(Value *V) { + auto UsesNarrowValue = [](Value *V) { + return V->getType()->getScalarSizeInBits() <= 32; + }; + + if (auto *Store = dyn_cast(V)) + return UsesNarrowValue(Store->getValueOperand()); + if (auto *Return = dyn_cast(V)) + return UsesNarrowValue(Return->getReturnValue()); + + return isa(V); +} + +/// Return true if the given value is a leaf that will need to be zext'd. +static bool isSource(Value *V) { + if (isa(V) && isSupportedType(V)) + return true; + else if (isa(V)) + return true; + else if (auto *ZExt = dyn_cast(V)) + // ZExt can be a leaf if its the only user of a load. + return isa(ZExt->getOperand(0)) && + ZExt->getOperand(0)->hasOneUse(); + else if (auto *Call = dyn_cast(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + else if (auto *Load = dyn_cast(V)) { + if (!isa(Load->getType())) + return false; + // A load is a leaf, unless its already just being zext'd. + if (Load->hasOneUse() && isa(*Load->use_begin())) + return false; + + return true; + } + return false; +} + +/// Return whether the instruction can be promoted within any modifications to +/// it's operands or result. +static bool isSafeOverflow(Instruction *I) { + if (isa(I) && I->hasNoUnsignedWrap()) + return true; + + unsigned Opc = I->getOpcode(); + if (Opc == Instruction::Add || Opc == Instruction::Sub) { + // We don't care if the add or sub could wrap if the value is decreasing + // and is only being used by an unsigned compare. 
+ if (!I->hasOneUse() || + !isa(*I->user_begin()) || + !isa(I->getOperand(1))) + return false; + + auto *CI = cast(*I->user_begin()); + if (CI->isSigned()) + return false; + + bool NegImm = cast(I->getOperand(1))->isNegative(); + bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || + ((Opc == Instruction::Add) && NegImm); + if (!IsDecreasing) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + return true; + } + + // Otherwise, if an instruction is using a negative immediate we will need + // to fix it up during the promotion. + for (auto &Op : I->operands()) { + if (auto *Const = dyn_cast(Op)) + if (Const->isNegative()) + return false; + } + return false; +} + +static bool shouldPromote(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return false; + + if (!isa(V->getType())) + return false; + + if (isa(I) || isa(I) || isa(I) || + isa(I)) + return false; + + if (auto *ZExt = dyn_cast(I)) + return !ZExt->getDestTy()->isIntegerTy(32); + + return true; +} + +/// Return whether we can safely mutate V's type to ExtTy without having to be +/// concerned with zero extending or truncation. +static bool isPromotedResultSafe(Value *V) { + if (!isa(V)) + return true; + + if (isSigned(V)) + return false; + + // If I is only being used by something that will require its value to be + // truncated, then we don't care about the promoted result. + auto *I = cast(V); + if (I->hasOneUse() && isSink(*I->use_begin())) + return true; + + if (isa(I)) + return isSafeOverflow(I); + return true; +} + +/// Return the intrinsic for the instruction that can perform the same +/// operation but on a narrow type. This is using the parallel dsp intrinsics +/// on scalar values. +static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) { + // Whether we use the signed or unsigned versions of these intrinsics + // doesn't matter because we're not using the GE bits that they set in + // the APSR. + switch(I->getOpcode()) { + default: + break; + case Instruction::Add: + return TypeSize == 16 ? Intrinsic::arm_uadd16 : + Intrinsic::arm_uadd8; + case Instruction::Sub: + return TypeSize == 16 ? Intrinsic::arm_usub16 : + Intrinsic::arm_usub8; + } + llvm_unreachable("unhandled opcode for narrow intrinsic"); +} + +void IRPromoter::Mutate(Type *OrigTy, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl &Leaves, + SmallPtrSetImpl &Roots) { + IRBuilder<> Builder{Ctx}; + Type *ExtTy = Type::getInt32Ty(M->getContext()); + unsigned TypeSize = OrigTy->getPrimitiveSizeInBits(); + SmallPtrSet Promoted; + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize + << " to 32-bits\n"); + + auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) { + SmallVector Users; + Instruction *InstTo = dyn_cast(To); + for (Use &U : From->uses()) { + auto *User = cast(U.getUser()); + if (InstTo && User->isIdenticalTo(InstTo)) + continue; + Users.push_back(User); + } + + for (auto &U : Users) + U->replaceUsesOfWith(From, To); + }; + + auto FixConst = [&](ConstantInt *Const, Instruction *I) { + Constant *NewConst = nullptr; + if (isSafeOverflow(I)) { + NewConst = (Const->isNegative()) ? 
+ ConstantExpr::getSExt(Const, ExtTy) : + ConstantExpr::getZExt(Const, ExtTy); + } else { + uint64_t NewVal = *Const->getValue().getRawData(); + if (Const->getType() == Type::getInt16Ty(Ctx)) + NewVal &= 0xFFFF; + else + NewVal &= 0xFF; + NewConst = ConstantInt::get(ExtTy, NewVal); + } + I->replaceUsesOfWith(Const, NewConst); + }; + + auto InsertDSPIntrinsic = [&](Instruction *I) { + LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " + << *I << "\n"); + Function *DSPInst = + Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize)); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + Value *Args[] = { I->getOperand(0), I->getOperand(1) }; + CallInst *Call = Builder.CreateCall(DSPInst, Args); + ReplaceAllUsersOfWith(I, Call); + InstsToRemove.push_back(I); + NewInsts.insert(Call); + }; + + auto InsertZExt = [&](Value *V, Instruction *InsertPt) { + LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n"); + Builder.SetInsertPoint(InsertPt); + if (auto *I = dyn_cast(V)) + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + auto *ZExt = cast(Builder.CreateZExt(V, ExtTy)); + if (isa(V)) + ZExt->moveBefore(InsertPt); + else + ZExt->moveAfter(InsertPt); + ReplaceAllUsersOfWith(V, ZExt); + NewInsts.insert(ZExt); + }; + + // First, insert extending instructions between the leaves and their users. + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n"); + for (auto V : Leaves) { + LLVM_DEBUG(dbgs() << " - " << *V << "\n"); + if (auto *ZExt = dyn_cast(V)) + ZExt->mutateType(ExtTy); + else if (auto *I = dyn_cast(V)) + InsertZExt(I, I); + else if (auto *Arg = dyn_cast(V)) { + BasicBlock &BB = Arg->getParent()->front(); + InsertZExt(Arg, &*BB.getFirstInsertionPt()); + } else { + llvm_unreachable("unhandled leaf that needs extending"); + } + Promoted.insert(V); + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); + // Then mutate the types of the instructions within the tree. Here we handle + // constant operands. + for (auto *V : Visited) { + if (Leaves.count(V)) + continue; + + if (!isa(V)) + continue; + + auto *I = cast(V); + if (Roots.count(I)) + continue; + + for (auto &U : I->operands()) { + if ((U->getType() == ExtTy) || !isSupportedType(&*U)) + continue; + + if (auto *Const = dyn_cast(&*U)) + FixConst(Const, I); + else if (isa(&*U)) + U->mutateType(ExtTy); + } + + if (shouldPromote(I)) { + I->mutateType(ExtTy); + Promoted.insert(I); + } + } + + // Now we need to remove any zexts that have become unnecessary, as well + // as insert any intrinsics. + for (auto *V : Visited) { + if (Leaves.count(V)) + continue; + if (auto *ZExt = dyn_cast(V)) { + if (ZExt->getDestTy() != ExtTy) { + ZExt->mutateType(ExtTy); + Promoted.insert(ZExt); + } + else if (ZExt->getSrcTy() == ExtTy) { + ReplaceAllUsersOfWith(V, ZExt->getOperand(0)); + InstsToRemove.push_back(ZExt); + } + continue; + } + + if (!shouldPromote(V) || isPromotedResultSafe(V)) + continue; + + // Replace unsafe instructions with appropriate intrinsic calls. + InsertDSPIntrinsic(cast(V)); + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n"); + // Fix up any stores or returns that use the results of the promoted + // chain. 
+ for (auto I : Roots) { + LLVM_DEBUG(dbgs() << " - " << *I << "\n"); + Type *TruncTy = OrigTy; + if (auto *Store = dyn_cast(I)) { + auto *PtrTy = cast(Store->getPointerOperandType()); + TruncTy = PtrTy->getElementType(); + } else if (isa(I)) { + Function *F = I->getParent()->getParent(); + TruncTy = F->getFunctionType()->getReturnType(); + } + + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *V = I->getOperand(i); + if (Promoted.count(V) || NewInsts.count(V)) { + if (auto *Op = dyn_cast(V)) { + + if (auto *Call = dyn_cast(I)) + TruncTy = Call->getFunctionType()->getParamType(i); + + if (TruncTy == ExtTy) + continue; + + LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy + << " Trunc for " << *Op << "\n"); + Builder.SetInsertPoint(Op); + auto *Trunc = cast(Builder.CreateTrunc(Op, TruncTy)); + Trunc->moveBefore(I); + I->setOperand(i, Trunc); + NewInsts.insert(Trunc); + } + } + } + } + LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n"); +} + +bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) { + if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) + return false; + + if (ST->isThumb() && !ST->hasThumb2()) + return false; + + if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) + return false; + + // TODO + // Would it be profitable? For Thumb code, these parallel DSP instructions + // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For + // Cortex-A, specifically Cortex-A72, the latency is double and throughput is + // halved. They also do not take immediates as operands. + for (auto &Op : I->operands()) { + if (isa(Op)) { + if (!EnableDSPWithImms) + return false; + } + } + return true; +} + +/// We accept most instructions, as well as Arguments and ConstantInsts. We +/// Disallow casts other than zext and truncs and only allow calls if their +/// return value is zeroext. We don't allow opcodes that can introduce sign +/// bits. +bool ARMCodeGenPrepare::isSupportedValue(Value *V) { + LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n"); + + // Non-instruction values that we can handle. + if (isa(V) || isa(V)) + return true; + + // Memory instructions + if (isa(V) || isa(V) || isa(V)) + return true; + + // Branches and targets. + if (auto *ICmp = dyn_cast(V)) + return ICmp->isEquality() || !ICmp->isSigned(); + + if( isa(V) || isa(V) || isa(V)) + return true; + + if (isa(V) || isa(V) || isa(V)) + return true; + + // Special cases for calls as we need to check for zeroext + // TODO We should accept calls even if they don't have zeroext, as they can + // still be roots. + if (auto *Call = dyn_cast(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + else if (auto *Cast = dyn_cast(V)) { + if (isa(Cast)) + return Cast->getDestTy()->getScalarSizeInBits() <= 32; + else if (auto *Trunc = dyn_cast(V)) + return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize; + else { + LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n"); + return false; + } + } else if (!isa(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n"); + return false; + } + + bool res = !isSigned(V); + if (!res) + LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n"); + return res; +} + +/// Check that the type of V would be promoted and that the original type is +/// smaller than the targeted promoted type. Check that we're not trying to +/// promote something larger than our base 'TypeSize' type. 
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { + if (!isSupportedType(V)) + return false; + + unsigned VSize = 0; + if (auto *Ld = dyn_cast(V)) { + auto *PtrTy = cast(Ld->getPointerOperandType()); + VSize = PtrTy->getElementType()->getPrimitiveSizeInBits(); + } else if (auto *ZExt = dyn_cast(V)) { + VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits(); + } else { + VSize = V->getType()->getPrimitiveSizeInBits(); + } + + if (VSize > TypeSize) + return false; + + if (isPromotedResultSafe(V)) + return true; + + if (auto *I = dyn_cast(V)) + return isNarrowInstSupported(I); + + return false; +} + +bool ARMCodeGenPrepare::TryToPromote(Value *V) { + OrigTy = V->getType(); + TypeSize = OrigTy->getPrimitiveSizeInBits(); + + if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n"); + + SetVector WorkList; + SmallPtrSet Leaves; + SmallPtrSet Roots; + WorkList.insert(V); + SmallPtrSet CurrentVisited; + CurrentVisited.clear(); + + // Return true if the given value can, or has been, visited. Add V to the + // worklist if needed. + auto AddLegalInst = [&](Value *V) { + if (CurrentVisited.count(V)) + return true; + + if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { + LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); + return false; + } + + WorkList.insert(V); + return true; + }; + + // Iterate through, and add to, a tree of operands and users in the use-def. + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + if (CurrentVisited.count(V)) + continue; + + if (!isa(V) && !isSource(V)) + continue; + + // If we've already visited this value from somewhere, bail now because + // the tree has already been explored. + // TODO: This could limit the transform, ie if we try to promote something + // from an i8 and fail first, before trying an i16. + if (AllVisited.count(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n"); + return false; + } + + CurrentVisited.insert(V); + AllVisited.insert(V); + + // Calls can be both sources and sinks. + if (isSink(V)) + Roots.insert(cast(V)); + if (isSource(V)) + Leaves.insert(V); + else if (auto *I = dyn_cast(V)) { + // Visit operands of any instruction visited. + for (auto &U : I->operands()) { + if (!AddLegalInst(U)) + return false; + } + } + + // Don't visit users of a node which isn't going to be mutated unless its a + // source. + if (isSource(V) || shouldPromote(V)) { + for (Use &U : V->uses()) { + if (!AddLegalInst(U.getUser())) + return false; + } + } + } + + unsigned NumToPromote = 0; + unsigned Cost = 0; + for (auto *V : CurrentVisited) { + // Truncs will cause a uxt and no zeroext arguments will often require + // a uxt somewhere. + if (isa(V)) + ++Cost; + else if (auto *Arg = dyn_cast(V)) { + if (!Arg->hasZExtAttr()) + ++Cost; + } + + // Mem ops can automatically be extended/truncated and non-instructions + // don't need anything done. + if (Leaves.count(V) || isa(V) || !isa(V)) + continue; + + // Will need to truncate calls args and returns. 
+ if (Roots.count(cast(V))) { + ++Cost; + continue; + } + + if (shouldPromote(V)) + ++NumToPromote; + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; + for (auto *I : CurrentVisited) + I->dump(); + ); + LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote + << " instructions = " << Cost << "\n"); + if (Cost > NumToPromote || (NumToPromote == 0)) + return false; + + Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots); + return true; +} + +bool ARMCodeGenPrepare::doInitialization(Module &M) { + Promoter = new IRPromoter(&M); + return false; +} + +bool ARMCodeGenPrepare::runOnFunction(Function &F) { + if (skipFunction(F) || DisableCGP) + return false; + + auto *TPC = &getAnalysis(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM(); + ST = &TM.getSubtarget(F); + bool MadeChange = false; + LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n"); + + // Search up from icmps to try to promote their operands. + for (BasicBlock &BB : F) { + auto &Insts = BB.getInstList(); + for (auto &I : Insts) { + if (AllVisited.count(&I)) + continue; + + if (isa(I)) { + auto &CI = cast(I); + + // Skip signed or pointer compares + if (CI.isSigned() || !isa(CI.getOperand(0)->getType())) + continue; + + LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); + for (auto &Op : CI.operands()) { + if (auto *I = dyn_cast(Op)) { + if (isa(I)) + MadeChange |= TryToPromote(I->getOperand(0)); + else + MadeChange |= TryToPromote(I); + } + } + } + } + Promoter->Cleanup(); + LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { + dbgs(); + report_fatal_error("Broken function after type promotion"); + }); + } + if (MadeChange) + LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n"); + + return MadeChange; +} + +INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE, + "ARM IR optimizations", false, false) +INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations", + false, false) + +char ARMCodeGenPrepare::ID = 0; + +FunctionPass *llvm::createARMCodeGenPreparePass() { + return new ARMCodeGenPrepare(); +} Index: llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp @@ -90,6 +90,7 @@ initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); + initializeARMCodeGenPreparePass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -346,6 +347,7 @@ } void addIRPasses() override; + void addCodeGenPrepare() override; bool addPreISel() override; bool addInstSelector() override; bool addIRTranslator() override; @@ -402,6 +404,12 @@ addPass(createInterleavedAccessPass()); } +void ARMPassConfig::addCodeGenPrepare() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createARMCodeGenPreparePass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool ARMPassConfig::addPreISel() { if (getOptLevel() != CodeGenOpt::None) addPass(createARMParallelDSPPass()); Index: llvm/trunk/lib/Target/ARM/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/ARM/CMakeLists.txt +++ llvm/trunk/lib/Target/ARM/CMakeLists.txt @@ -23,6 +23,7 @@ ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMCallLowering.cpp + ARMCodeGenPrepare.cpp ARMConstantIslandPass.cpp ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp 
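As a rough illustration of what the pass aims to do (an editor's sketch based on the tests below, not exact pass output): for an unsigned i8 chain such as

  %add = add i8 %x, -1
  %cmp = icmp ugt i8 %add, 1

the leaves are zero-extended and the whole chain is carried in i32, so no uxtb is needed before the compare:

  %x32 = zext i8 %x to i32          ; names %x32/%add32 are illustrative only
  %add32 = add i32 %x32, -1
  %cmp = icmp ugt i32 %add32, 1

The plain widened add is only used when the result cannot change the comparison (here the value is decreasing and only feeds an unsigned compare, the isSafeOverflow case). When wrapping does matter and the target has DSP, the add/sub is instead replaced by one of the unsigned parallel DSP intrinsics (uadd8/uadd16, usub8/usub16) so that only the low bits reach the icmp, as the CHECK-DSP lines in the tests below show.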
Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll @@ -0,0 +1,468 @@ +; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP +; RUN: llc -mtriple=thumbv8 %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM + +; CHECK-COMMON-LABEL: test_ult_254_inc_imm: +; CHECK-DSP: adds r0, #1 +; CHECK-DSP-NEXT: uxtb r1, r0 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 + +; CHECK-DSP-IMM: movs r1, #1 +; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r0, #47 +; CHECK-DSP-IMM-NEXT: cmp r1, #254 +; CHECK-DSP-IMM-NEXT: it lo +; CHECK-DSP-IMM-NEXT: movlo r0, #35 +define i32 @test_ult_254_inc_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, 1 + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_slt_254_inc_imm +; CHECK-COMMON: adds +; CHECK-COMMON: sxtb +define i32 @test_slt_254_inc_imm(i8 signext %x) { +entry: + %add = add i8 %x, 1 + %cmp = icmp slt i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ult_254_inc_var: +; CHECK-NODSP: add r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #254 +; CHECK-NODSP-NEXT: it lo +; CHECK-NODSP-NEXT: movlo r0, #35 + +; CHECK-DSP: uadd8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 +define i32 @test_ult_254_inc_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %add = add i8 %x, %y + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sle_254_inc_var +; CHECK-COMMON: add +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sle_254_inc_var(i8 %x, i8 %y) { +entry: + %add = add i8 %x, %y + %cmp = icmp sle i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ugt_1_dec_imm: +; CHECK-COMMON: subs r1, r0, #1 +; CHECK-COMMON-NEXT: movs r0, #47 +; CHECK-COMMON-NEXT: cmp r1, #1 +; CHECK-COMMON-NEXT: it hi +; CHECK-COMMON-NEXT: movhi r0, #35 +define i32 @test_ugt_1_dec_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp ugt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sgt_1_dec_imm +; CHECK-COMMON: subs +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sgt_1_dec_imm(i8 %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp sgt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ugt_1_dec_var: +; CHECK-NODSP: subs r0, r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #1 +; CHECK-NODSP-NEXT: it hi +; CHECK-NODSP-NEXT: movhi r0, #35 + +; CHECK-DSP: usub8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #1 +; CHECK-DSP-NEXT: it hi +; CHECK-DSP-NEXT: movhi r0, #35 +define i32 @test_ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp ugt i8 %sub, 1 
+ %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sge_1_dec_var +; CHECK-COMMON: sub +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sge_1_dec_var(i8 %x, i8 %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp sge i8 %sub, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: dsp_imm1: +; CHECK-DSP: eors r1, r0 +; CHECK-DSP-NEXT: and r0, r0, #7 +; CHECK-DSP-NEXT: subs r0, r0, r1 +; CHECK-DSP-NEXT: adds r0, #1 +; CHECK-DSP-NEXT: uxtb r1, r0 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 + +; CHECK-DSP-IMM: eors r1, r0 +; CHECK-DSP-IMM-NEXT: and r0, r0, #7 +; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r1, #1 +; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r0, #47 +; CHECK-DSP-IMM-NEXT: cmp r1, #254 +; CHECK-DSP-IMM-NEXT: it lo +; CHECK-DSP-IMM-NEXT: movlo r0, #35 +define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) { +entry: + %xor = xor i8 %x, %y + %and = and i8 %x, 7 + %sub = sub i8 %and, %xor + %add = add i8 %sub, 1 + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: dsp_imm2 +; CHECK-COMMON: add r0, r1 +; CHECK-DSP-NEXT: ldrh r1, [r3] +; CHECK-DSP-NEXT: ldrh r2, [r2] +; CHECK-DSP-NEXT: subs r1, r1, r0 +; CHECK-DSP-NEXT: add r0, r2 +; CHECK-DSP-NEXT: uxth r3, r1 +; CHECK-DSP-NEXT: uxth r2, r0 +; CHECK-DSP-NEXT: cmp r2, r3 + +; CHECK-DSP-IMM: movs r1, #0 +; CHECK-DSP-IMM-NEXT: uxth r0, r0 +; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0 +; CHECK-DSP-IMM-NEXT: ldrh r0, [r2] +; CHECK-DSP-IMM-NEXT: ldrh r3, [r3] +; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1 +; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1 +; CHECK-DSP-IMM-NEXT: cmp r0, r1 + +define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) { +entry: + %add0 = add i32 %arg0, %arg1 + %conv0 = trunc i32 %add0 to i16 + %sub0 = sub i16 0, %conv0 + %load0 = load i16, i16* %gep0, align 2 + %load1 = load i16, i16* %gep1, align 2 + %sub1 = sub i16 %load0, %sub0 + %add1 = add i16 %load1, %sub0 + %cmp = icmp ult i16 %sub1, %add1 + %res = select i1 %cmp, i16 %add1, i16 %sub1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: dsp_var: +; CHECK-COMMON: eors r1, r0 +; CHECK-COMMON: and r2, r0, #7 +; CHECK-NODSP: subs r1, r2, r1 +; CHECK-NODSP: add.w r0, r1, r0, lsl #1 +; CHECK-NODSP: uxtb r1, r0 +; CHECK-DSP: usub8 r1, r2, r1 +; CHECK-DSP: lsls r0, r0, #1 +; CHECK-DSP: uadd8 r1, r1, r0 +; CHECK-DSP-NOT: uxt +; CHECK-COMMON: movs r0, #47 +; CHECK-COMMON: cmp r1, #254 +; CHECK-COMMON: it lo +; CHECK-COMMON: movlo r0, #35 +define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) { + %xor = xor i8 %x, %y + %and = and i8 %x, 7 + %sub = sub i8 %and, %xor + %mul = shl nuw i8 %x, 1 + %add = add i8 %sub, %mul + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: store_dsp_res +; CHECK-DSP: usub8 +; CHECK-DSP: strb +define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) { + %first = getelementptr inbounds i8, i8* %in, i32 0 + %second = getelementptr inbounds i8, i8* %in, i32 1 + %ld0 = load i8, i8* %first + %ld1 = load i8, i8* %second + %xor = xor i8 %ld0, -1 + %cmp = icmp ult i8 %compare, %ld1 + %select = select i1 %cmp, i8 %compare, i8 %xor + %sub = sub i8 %ld0, %select + store i8 %sub, i8* %out, align 1 + ret void +} + +; CHECK-COMMON-LABEL: ugt_1_dec_imm: +; CHECK-COMMON: subs r1, r0, #1 +; CHECK-COMMON-NEXT: movs r0, #47 +; CHECK-COMMON-NEXT: cmp r1, #1 +; 
CHECK-COMMON-NEXT: it hi +; CHECK-COMMON-NEXT: movhi r0, #35 +define i32 @ugt_1_dec_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp ugt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: ugt_1_dec_var: +; CHECK-NODSP: subs r0, r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #1 +; CHECK-NODSP-NEXT: it hi +; CHECK-NODSP-NEXT: movhi r0, #35 + +; CHECK-DSP: usub8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #1 +; CHECK-DSP-NEXT: it hi +; CHECK-DSP-NEXT: movhi r0, #35 +define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp ugt i8 %sub, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: icmp_i32_zext: +; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r0] +; CHECK-COMMON: subs [[SUB:r[^ ]+]], [[LD]], #1 +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: cmp [[LD]], [[SUB]] +; CHECK-COMMON-NOT: uxt +define i8 @icmp_i32_zext(i8* %ptr) { +entry: + %gep = getelementptr inbounds i8, i8* %ptr, i32 0 + %0 = load i8, i8* %gep, align 1 + %1 = sub nuw nsw i8 %0, 1 + %conv44 = zext i8 %0 to i32 + br label %preheader + +preheader: + br label %body + +body: + %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ] + %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ] + %conv51266 = zext i8 %2 to i32 + %cmp52267 = icmp eq i32 %si.0274, %conv51266 + br i1 %cmp52267, label %if.end, label %exit + +if.end: + %inc = add i32 %si.0274, 1 + %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc + %3 = load i8, i8* %gep1, align 1 + br label %body + +exit: + ret i8 %2 +} + +@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 +@sh1 = hidden local_unnamed_addr global i16 0, align 2 +@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2 + +; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16 +; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]], +; CHECK-NODSP: strh [[BYTE]], +; CHECK-NODSP: ldrsh.w +define i32 @icmp_sext_zext_store_i8_i16() { +entry: + %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1 + %conv = zext i8 %0 to i16 + store i16 %conv, i16* @sh1, align 2 + %conv1 = zext i8 %0 to i32 + %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2 + %conv2 = sext i16 %1 to i32 + %cmp = icmp eq i32 %conv1, %conv2 + %conv3 = zext i1 %cmp to i32 + ret i32 %conv3 +} + +; CHECK-COMMON-LABEL: or_icmp_ugt: +; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r1] +; CHECK-COMMON: subs [[SUB:r[^ ]+]], #1 +; CHECK-COMMON-NOT: uxtb +; CHECK-COMMON: cmp [[SUB]], #3 +define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { +entry: + %0 = load i8, i8* %ptr + %1 = zext i8 %0 to i32 + %mul = shl nuw nsw i32 %1, 1 + %add0 = add nuw nsw i32 %mul, 6 + %cmp0 = icmp ne i32 %arg, %add0 + %add1 = add i8 %0, -1 + %cmp1 = icmp ugt i8 %add1, 3 + %or = or i1 %cmp0, %cmp1 + ret i1 %or +} + +; CHECK-COMMON-LABEL: icmp_switch_trunc: +; CHECK-COMMON-NOT: uxt +define i16 @icmp_switch_trunc(i16 zeroext %arg) { +entry: + %conv = add nuw i16 %arg, 15 + %mul = mul nuw nsw i16 %conv, 3 + %trunc = trunc i16 %arg to i3 + switch i3 %trunc, label %default [ + i3 0, label %sw.bb + i3 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ 
%select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +; CHECK-COMMON-LABEL: icmp_eq_minus_one +; CHECK-COMMON: cmp r0, #255 +define i32 @icmp_eq_minus_one(i8* %ptr) { + %load = load i8, i8* %ptr, align 1 + %conv = zext i8 %load to i32 + %cmp = icmp eq i8 %load, -1 + %ret = select i1 %cmp, i32 %conv, i32 -1 + ret i32 %ret +} + +; CHECK-COMMON-LABEL: icmp_not +; CHECK-COMMON: movw r2, #65535 +; CHECK-COMMON: eors r2, r0 +; CHECK-COMMON: movs r0, #32 +; CHECK-COMMON: cmp r2, r1 +define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) { + %not = xor i16 %arg0, -1 + %cmp = icmp eq i16 %not, %arg1 + %res = select i1 %cmp, i32 16, i32 32 + ret i32 %res +} + +; CHECK-COMMON-LABEL: mul_wrap +; CHECK-COMMON: mul +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @mul_wrap(i16 %arg0, i16 %arg1) { + %mul = mul i16 %arg0, %arg1 + %cmp = icmp eq i16 %mul, 1 + %res = select i1 %cmp, i16 %arg0, i16 47 + ret i16 %res +} + +; CHECK-COMMON-LABEL: shl_wrap +; CHECK-COMMON: lsl +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @shl_wrap(i16 %arg0) { + %mul = shl i16 %arg0, 4 + %cmp = icmp eq i16 %mul, 1 + %res = select i1 %cmp, i16 %arg0, i16 47 + ret i16 %res +} + +; CHECK-COMMON-LABEL: add_wrap +; CHECK-COMMON: add +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @add_wrap(i16 %arg0, i16 %arg1) { + %add = add i16 %arg0, 128 + %cmp = icmp eq i16 %add, %arg1 + %res = select i1 %cmp, i16 %arg0, i16 1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: sub_wrap +; CHECK-COMMON: sub +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) { + %sub = sub i16 %arg0, %arg2 + %cmp = icmp eq i16 %sub, %arg1 + %res = select i1 %cmp, i16 %arg0, i16 1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: urem_trunc_icmps +; CHECK-COMMON-NOT: uxt +define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) { +entry: + %ptr = load i16*, i16** %in, align 4 + %ld = load i16, i16* %ptr, align 2 + %cmp.i = icmp eq i16 %ld, 0 + br i1 %cmp.i, label %exit, label %cond.false.i + +cond.false.i: + %rem = urem i16 5, %ld + %extract.t = trunc i16 %rem to i8 + br label %body + +body: + %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ] + %cmp = icmp ugt i8 %cond.in.i.off0, 7 + %conv5 = zext i1 %cmp to i32 + store i32 %conv5, i32* %g, align 4 + %.pr = load i32, i32* %k, align 4 + %tobool13150 = icmp eq i32 %.pr, 0 + br i1 %tobool13150, label %for.inc, label %exit + +for.inc: + %add = add nuw i8 %cond.in.i.off0, 1 + br label %body + +exit: + ret void +} Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll @@ -0,0 +1,392 @@ +; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv8m.main -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP +; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM + +; Test that ARMCodeGenPrepare can handle: +; - loops +; - call operands +; - call return values +; - ret instructions +; We use nuw on the arithmetic instructions to avoid complications. 
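An informal note on the nuw flags mentioned above (a sketch of the intended behaviour, not part of the test file): a no-unsigned-wrap add is treated as safe to evaluate in the wider type (see isSafeOverflow/isPromotedResultSafe), so a loop such as phi_i16 below can be carried entirely in i32, roughly:

  loop:
    %val = phi i32 [ 0, %entry ], [ %inc2, %if.end ]
    %cmp = icmp ult i32 %val, 128
    ...
  if.end:
    %inc2 = phi i32 [ %inc, %if.then ], [ %inc1, %if.else ]
    %cmp1 = icmp ult i32 %inc2, 253

Because nothing wraps, the i32 compares see the same values as the original i16 ones and no uxth is required, which is what the CHECK-COMMON-NOT lines verify.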
+ +; Check that the arguments are extended but then nothing else is. +; This also ensures that the pass can handle loops. +; CHECK-COMMON-LABEL: phi_feeding_phi_args +; CHECK-COMMON: uxtb +; CHECK-COMMON: uxtb +; CHECK-NOT: uxtb +define void @phi_feeding_phi_args(i8 %a, i8 %b) { +entry: + %0 = icmp ugt i8 %a, %b + br i1 %0, label %preheader, label %empty + +empty: + br label %preheader + +preheader: + %1 = phi i8 [ %a, %entry ], [ %b, %empty ] + br label %loop + +loop: + %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 254 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = sub nuw i8 %val, 2 + br label %if.end + +if.else: + %inc1 = shl nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp eq i8 %inc2, 255 + br i1 %cmp1, label %exit, label %loop + +exit: + ret void +} + +; Same as above, but as the args are zeroext, we shouldn't see any uxts. +; CHECK-COMMON-LABEL: phi_feeding_phi_zeroext_args +; CHECK-COMMON-NOT: uxt +define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) { +entry: + %0 = icmp ugt i8 %a, %b + br i1 %0, label %preheader, label %empty + +empty: + br label %preheader + +preheader: + %1 = phi i8 [ %a, %entry ], [ %b, %empty ] + br label %loop + +loop: + %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 254 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = sub nuw i8 %val, 2 + br label %if.end + +if.else: + %inc1 = shl nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp eq i8 %inc2, 255 + br i1 %cmp1, label %exit, label %loop + +exit: + ret void +} + +; Just check that phis also work with i16s. +; CHECK-COMMON-LABEL: phi_i16: +; CHECK-COMMON-NOT: uxt +define void @phi_i16() { +entry: + br label %loop + +loop: + %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ] + %cmp = icmp ult i16 %val, 128 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = add nuw i16 %val, 2 + br label %if.end + +if.else: + %inc1 = add nuw i16 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp ult i16 %inc2, 253 + br i1 %cmp1, label %loop, label %exit + +exit: + ret void +} + +; CHECK-COMMON-LABEL: phi_feeding_switch +; CHECK-COMMON: ldrb +; CHECK-COMMON: uxtb +; CHECK-COMMON-NOT: uxt +define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) { +entry: + %pre = load i8, i8* %memblock, align 1 + %conv = trunc i16 %arg to i8 + br label %header + +header: + %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] + %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] + %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] + switch i8 %phi.0, label %default [ + i8 43, label %for.inc.i + i8 45, label %for.inc.i.i + ] + +for.inc.i: + %xor = xor i8 %phi.1, 1 + br label %latch + +for.inc.i.i: + %and = and i8 %phi.1, 3 + br label %latch + +default: + %sub = sub i8 %phi.0, 1 + %cmp2 = icmp ugt i8 %sub, 4 + br i1 %cmp2, label %latch, label %exit + +latch: + %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] + %count = add nuw i8 %phi.2, 1 + store i8 %count, i8* %store, align 1 + br label %header + +exit: + ret void +} + +; CHECK-COMMON-LABEL: ret_i8 +; CHECK-COMMON-NOT: uxt +define i8 @ret_i8() { +entry: + br label %loop + +loop: + %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 128 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = add nuw 
i8 %val, 2 + br label %if.end + +if.else: + %inc1 = add nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp ult i8 %inc2, 253 + br i1 %cmp1, label %exit, label %loop + +exit: + ret i8 %inc2 +} + +; Check that %exp requires uxth in all cases, and that a uxth will also be required to +; promote %1 for the call - unless we can generate a uadd16. +; CHECK-COMMON-LABEL: zext_load_sink_call: +; CHECK-COMMON: uxt +; CHECK-DSP-IMM: uadd16 +; CHECK-COMMON: cmp +; CHECK-DSP: uxt +; CHECK-DSP-IMM-NOT: uxt +define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) { +entry: + %0 = load i16, i16* %ptr, align 4 + %1 = add i16 %exp, 3 + %cmp = icmp eq i16 %0, %exp + br i1 %cmp, label %exit, label %if.then + +if.then: + %conv0 = zext i16 %0 to i32 + %conv1 = zext i16 %1 to i32 + %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1) + br label %exit + +exit: + %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ] + ret i32 %exitval +} + + +; Check that the pass doesn't try to promote the immediate parameters. +; CHECK-COMMON-LABEL: call_with_imms +; CHECK-COMMON-NOT: uxt +define i8 @call_with_imms(i8* %arg) { + %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0) + %cmp = icmp eq i8 %call, 0 + %res = select i1 %cmp, i8 %call, i8 1 + ret i8 %res +} + +; Test that the call result is still extended. +; CHECK-COMMON-LABEL: test_call: +; CHECK-COMMON: bl +; CHECK-COMMON-NEXT: sxtb r1, r0 +define i16 @test_call(i8 zeroext %arg) { + %call = call i8 @dummy_i8(i8 %arg) + %cmp = icmp ult i8 %call, 128 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; Test that the transformation bails when it finds that i16 is larger than i8. +; TODO: We should be able to remove the uxtb in these cases.
+; CHECK-COMMON-LABEL: promote_i8_sink_i16_1 +; CHECK-COMMON: bl dummy_i8 +; CHECK-COMMON: adds r0, #1 +; CHECK-COMMON: uxtb r0, r0 +; CHECK-COMMON: cmp r0 +define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) { + %call = tail call zeroext i8 @dummy_i8(i8 %arg0) + %add = add nuw i8 %call, 1 + %conv = zext i8 %add to i16 + %cmp = icmp ne i16 %conv, %arg1 + %sel = select i1 %cmp, i16 %arg1, i16 %arg2 + %res = tail call zeroext i16 @dummy3(i16 %sel) + ret i16 %res +} + +; CHECK-COMMON-LABEL: promote_i8_sink_i16_2 +; CHECK-COMMON: bl dummy_i8 +; CHECK-COMMON: adds r0, #1 +; CHECK-COMMON: uxtb r0, r0 +; CHECK-COMMON: cmp r0 +define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) { + %call = tail call zeroext i8 @dummy_i8(i8 %arg0) + %add = add nuw i8 %call, 1 + %cmp = icmp ne i8 %add, %arg1 + %conv = zext i8 %arg1 to i16 + %sel = select i1 %cmp, i16 %conv, i16 %arg2 + %res = tail call zeroext i16 @dummy3(i16 %sel) + ret i16 %res +} + +@uc = global i8 42, align 1 +@LL = global i64 0, align 8 + +; CHECK-COMMON-LABEL: zext_i64 +; CHECK-COMMON: ldrb +; CHECK-COMMON: strd +define void @zext_i64() { +entry: + %0 = load i8, i8* @uc, align 1 + %conv = zext i8 %0 to i64 + store i64 %conv, i64* @LL, align 8 + %cmp = icmp eq i8 %0, 42 + %conv1 = zext i1 %cmp to i32 + %call = tail call i32 bitcast (i32 (...)* @assert to i32 (i32)*)(i32 %conv1) + ret void +} + +@a = global i16* null, align 4 +@b = global i32 0, align 4 + +; CHECK-COMMON-LABEL: constexpr +; CHECK-COMMON: uxth +define i32 @constexpr() { +entry: + store i32 ptrtoint (i32* @b to i32), i32* @b, align 4 + %0 = load i16*, i16** @a, align 4 + %1 = load i16, i16* %0, align 2 + %or = or i16 %1, ptrtoint (i32* @b to i16) + store i16 %or, i16* %0, align 2 + %cmp = icmp ne i16 %or, 4 + %conv3 = zext i1 %cmp to i32 + %call = tail call i32 bitcast (i32 (...)* @e to i32 (i32)*)(i32 %conv3) #2 + ret i32 undef +} + +; Check that d.sroa.0.0.be is promoted and passed directly into the tail call. +; CHECK-COMMON-LABEL: check_zext_phi_call_arg +; CHECK-COMMON-NOT: uxt +define i32 @check_zext_phi_call_arg() { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.backedge, %entry + %d.sroa.0.0 = phi i16 [ 30, %entry ], [ %d.sroa.0.0.be, %for.cond.backedge ] + %tobool = icmp eq i16 %d.sroa.0.0, 0 + br i1 %tobool, label %for.cond.backedge, label %if.then + +for.cond.backedge: ; preds = %for.cond, %if.then + %d.sroa.0.0.be = phi i16 [ %call, %if.then ], [ 0, %for.cond ] + br label %for.cond + +if.then: ; preds = %for.cond + %d.sroa.0.0.insert.ext = zext i16 %d.sroa.0.0 to i32 + %call = tail call zeroext i16 bitcast (i16 (...)* @f to i16 (i32)*)(i32 %d.sroa.0.0.insert.ext) #2 + br label %for.cond.backedge +} + + +; The call to safe_lshift_func takes two parameters, but they're the same value, just one is zext.
+; CHECK-COMMON-LABEL: call_zext_i8_i32 +define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) { +for.cond8.preheader: + %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef) + %tobool219 = icmp eq i8 %call217, 0 + br i1 %tobool219, label %for.end411, label %for.cond273.preheader + +for.cond273.preheader: ; preds = %for.cond8.preheader + %call217.lcssa = phi i8 [ %call217, %for.cond8.preheader ] + %conv218.le = zext i8 %call217.lcssa to i32 + %call346 = call fastcc zeroext i8 @safe_lshift_func(i8 zeroext %call217.lcssa, i32 %conv218.le) + unreachable + +for.end411: ; preds = %for.cond8.preheader + %call452 = call fastcc i64 @safe_sub_func_int64_t_s_s(i64 undef, i64 4) + unreachable +} + +%struct.anon = type { i32 } + +@g_57 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 +@g_893 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 +@g_82 = hidden local_unnamed_addr global i32 0, align 4 + +; Test that the transform bails on finding a call which returns an i16** +; CHECK-COMMON-LABEL: call_return_pointer +; CHECK-COMMON: sxth +; CHECK-COMMON-NOT: uxt +define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 { +entry: + %conv1 = zext i8 %p_13 to i16 + %call = tail call i16** @func_62(i8 zeroext undef, i32 undef, i16 signext %conv1, i32* undef) + %0 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @g_893, i32 0, i32 0), align 4 + %conv2 = trunc i32 %0 to i16 + br label %for.cond + +for.cond: ; preds = %for.cond.backedge, %entry + %p_13.addr.0 = phi i8 [ %p_13, %entry ], [ %p_13.addr.0.be, %for.cond.backedge ] + %tobool = icmp eq i8 %p_13.addr.0, 0 + br i1 %tobool, label %for.cond.backedge, label %if.then + +for.cond.backedge: ; preds = %for.cond, %if.then + %p_13.addr.0.be = phi i8 [ %conv4, %if.then ], [ 0, %for.cond ] + br label %for.cond + +if.then: ; preds = %for.cond + %call3 = tail call fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %conv2) + %conv4 = trunc i16 %call3 to i8 + br label %for.cond.backedge +} + +declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66) +declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2) +declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64) +declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32) +declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext) + +declare dso_local i32 @e(...) local_unnamed_addr #1 +declare dso_local zeroext i16 @f(...) local_unnamed_addr #1 + +declare i32 @dummy(i32, i32) +declare i8 @dummy_i8(i8) +declare i8 @dummy2(i8*, i8, i8) +declare i16 @dummy3(i16) +declare i32 @assert(...) Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll @@ -0,0 +1,45 @@ +; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s +; RUN: llc -mtriple=armv8 %s -o - | FileCheck %s + +; Test to check that ARMCodeGenPrepare doesn't optimise away sign extends.
+; CHECK-LABEL: test_signed_load: +; CHECK: uxth +define i16 @test_signed_load(i16* %ptr) { + %load = load i16, i16* %ptr + %conv0 = zext i16 %load to i32 + %conv1 = sext i16 %load to i32 + %cmp = icmp eq i32 %conv0, %conv1 + %conv2 = zext i1 %cmp to i16 + ret i16 %conv2 +} + +; Don't allow sign bit generating opcodes. +; CHECK-LABEL: test_ashr: +; CHECK: sxth +define i16 @test_ashr(i16 zeroext %arg) { + %ashr = ashr i16 %arg, 1 + %cmp = icmp eq i16 %ashr, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; CHECK-LABEL: test_sdiv: +; CHECK: sxth +define i16 @test_sdiv(i16 zeroext %arg) { + %sdiv = sdiv i16 %arg, 2 + %cmp = icmp ne i16 %sdiv, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; CHECK-LABEL: test_srem +; CHECK: sxth +define i16 @test_srem(i16 zeroext %arg) { + %srem = srem i16 %arg, 4 + %cmp = icmp ne i16 %srem, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} +
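For local experimentation, the behaviour added in this patch can be toggled from the llc command line using the options defined in ARMCodeGenPrepare.cpp; for example (input.ll is just a placeholder name):

  llc -mtriple=thumbv7em -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true input.ll -o -
  llc -mtriple=thumbv7em -arm-disable-cgp=true input.ll -o -

The first form mirrors the CHECK-DSP-IMM RUN lines above; the second disables the new pass entirely.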