Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -35,6 +35,8 @@ enum ID : unsigned; } +class AssumptionCache; +class BranchInst; class Function; class GlobalValue; class IntrinsicInst; @@ -44,6 +46,7 @@ class ScalarEvolution; class StoreInst; class SwitchInst; +class TargetLibraryInfo; class Type; class User; class Value; @@ -445,6 +448,25 @@ void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) const; + struct HardwareLoopInfo { + HardwareLoopInfo(Loop *L) : L(L) { } + Loop *L = nullptr; + BasicBlock *ExitBlock = nullptr; + BranchInst *ExitBranch = nullptr; + const SCEV *ExitCount = nullptr; + Instruction *Predicate = nullptr; + IntegerType *CountType = nullptr; + bool PerformTest = false; + bool IsNestingLegal = false; + bool InsertPHICounter = false; + unsigned NumElements = 1; + }; + + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) const; + /// @} /// \name Scalar Target Information @@ -1073,6 +1095,10 @@ virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) = 0; + virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1304,6 +1330,12 @@ UnrollingPreferences &UP) override { return Impl.getUnrollingPreferences(L, SE, UP); } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + HardwareLoopInfo &HWLoopInfo) override { + return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -190,6 +190,13 @@ return true; } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + TTI::HardwareLoopInfo &HWLoopInfo) { + return false; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, TTI::UnrollingPreferences &) {} Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -485,6 +485,13 @@ UP.BEInsns = 2; } + bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *LibInfo, + TTI::HardwareLoopInfo &HWLoopInfo) { + return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); + } + int getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -446,6 +446,8 @@ /// Creates CFI Instruction Inserter pass. 
\see CFIInstrInserter.cpp FunctionPass *createCFIInstrInserter(); + FunctionPass *createHardwareLoops(); + } // End llvm namespace #endif Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -1180,6 +1180,25 @@ [llvm_anyvector_ty], [IntrNoMem]>; +def int_set_loop_iterations : + Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_test_set_loop_iterations : + Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_set_loop_elements : + Intrinsic<[], [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_test_set_loop_elements : + Intrinsic<[llvm_i1_ty], [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_loop_dec : + Intrinsic<[llvm_anyint_ty], + [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>; + +def int_get_active_mask_4 : + Intrinsic<[llvm_v4i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>; + //===----- Intrinsics that are used to provide predicate information -----===// def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -162,6 +162,7 @@ void initializeGlobalSplitPass(PassRegistry&); void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); +void initializeHardwareLoopsPass(PassRegistry&); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPCPPass(PassRegistry&); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -135,6 +135,12 @@ return TTIImpl->getUnrollingPreferences(L, SE, UP); } +bool TargetTransformInfo::isHardwareLoopProfitable( + Loop *L, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const { + return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); +} + bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { return TTIImpl->isLegalAddImmediate(Imm); } Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ GCRootLowering.cpp GCStrategy.cpp GlobalMerge.cpp + HardwareLoops.cpp IfConversion.cpp ImplicitNullChecks.cpp IndirectBrExpandPass.cpp Index: lib/CodeGen/HardwareLoops.cpp =================================================================== --- /dev/null +++ lib/CodeGen/HardwareLoops.cpp @@ -0,0 +1,483 @@ +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" 
+#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "hardware-loops" + +#define HW_LOOPS_NAME "Hardware Loop Insertion" + +STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); + +namespace { + + using TTI = TargetTransformInfo; + + class HardwareLoops : public FunctionPass { + public: + static char ID; + + HardwareLoops() : FunctionPass(ID) { + initializeHardwareLoopsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool TryConvertLoop(Loop *L); + bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo); + void ConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo); + + private: + ScalarEvolution *SE = nullptr; + LoopInfo *LI = nullptr; + const DataLayout *DL = nullptr; + const TargetTransformInfo *TTI = nullptr; + DominatorTree *DT = nullptr; + bool PreserveLCSSA = false; + AssumptionCache *AC = nullptr; + TargetLibraryInfo *LibInfo = nullptr; + Module *M = nullptr; + }; +} + +char HardwareLoops::ID = 0; + +bool HardwareLoops::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n"); + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + LI = &getAnalysis().getLoopInfo(); + SE = &getAnalysis().getSE(); + DT = &getAnalysis().getDomTree(); + TTI = &getAnalysis().getTTI(F); + DL = &F.getParent()->getDataLayout(); + auto *TLIP = getAnalysisIfAvailable(); + LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + AC = &getAnalysis().getAssumptionCache(F); + M = F.getParent(); + + bool MadeChange = false; + + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) { + Loop *L = *I; + + if (!L->getParentLoop()) + MadeChange |= TryConvertLoop(L); + } + + return MadeChange; +} + +bool HardwareLoops::TryConvertLoop(Loop *L) { + bool MadeChange = false; + + // Process nested loops first. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + MadeChange |= TryConvertLoop(*I); + } + + if (MadeChange) + return true; + + // Bail out if the loop has irreducible control flow. 
+ LoopBlocksRPO RPOT(L); + RPOT.perform(LI); + if (containsIrreducibleCFG(RPOT, *LI)) { + LLVM_DEBUG(dbgs() << "HWLoops: Loop contains irreducible CFG.\n"); + return false; + } + + TTI::HardwareLoopInfo HWLoopInfo(L); + if (!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo)) { + LLVM_DEBUG(dbgs() << "HWLoops: Not profitable to convert loop.\n"); + return MadeChange; + } + + MadeChange |= TryConvertLoop(HWLoopInfo); + return MadeChange; +} + +bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) { + + Loop *L = HWLoopInfo.L; + //BasicBlock *CountedExitBlock = nullptr; + //const SCEV *ExitCount = nullptr; + //BranchInst *CountedExitBranch = nullptr; + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable do loop: " + << *L << "\n"); + + for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), + IE = ExitingBlocks.end(); I != IE; ++I) { + const SCEV *EC = SE->getExitCount(L, *I); + LLVM_DEBUG(dbgs() << "HWLoops: Exit Count for " << *L << " from block " + << (*I)->getName() << ": " << *EC << "\n"); + if (isa(EC)) + continue; + if (const SCEVConstant *ConstEC = dyn_cast(EC)) { + if (ConstEC->getValue()->isZero()) + continue; + } else if (!SE->isLoopInvariant(EC, L)) + continue; + + if (SE->getTypeSizeInBits(EC->getType()) > + HWLoopInfo.CountType->getBitWidth()) + continue; + + // If this exiting block is contained in a nested loop, it is not eligible + // for insertion of the branch-and-decrement since the inner loop would + // end up messing up the value in the CTR. + if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L) + continue; + + // We now have a loop-invariant count of loop iterations (which is not the + // constant zero) for which we know that this loop will not exit via this + // existing block. + + // We need to make sure that this block will run on every loop iteration. + // For this to be true, we must dominate all blocks with backedges. Such + // blocks are in-loop predecessors to the header block. + bool NotAlways = false; + for (pred_iterator PI = pred_begin(L->getHeader()), + PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { + if (!L->contains(*PI)) + continue; + + if (!DT->dominates(*I, *PI)) { + NotAlways = true; + break; + } + } + + if (NotAlways) + continue; + + // Make sure this blocks ends with a conditional branch. + Instruction *TI = (*I)->getTerminator(); + if (!TI) + continue; + + if (BranchInst *BI = dyn_cast(TI)) { + if (!BI->isConditional()) + continue; + + HWLoopInfo.ExitBranch = BI; + } else + continue; + + // Note that this block may not be the loop latch block, even if the loop + // has a latch block. + HWLoopInfo.ExitBlock = *I; + HWLoopInfo.ExitCount = EC; + break; + } + + if (!HWLoopInfo.ExitBlock) { + LLVM_DEBUG(dbgs() << "HWLoops: Unable to find CountExitBlock.\n"); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + + // If we don't have a preheader, then insert one. If we already have a + // preheader, then we can use it (except if the preheader contains a use of + // the CTR register because some such uses might be reordered by the + // selection DAG after the mtctr instruction). 
+ if (!Preheader)// || mightUseCTR(Preheader)) + Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); + if (!Preheader) + return false; + + LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() + << "\n"); + + + ConvertLoop(HWLoopInfo); + LLVM_DEBUG(dbgs() << "Converted Loop: " << *L << "\n"); + ++NumHWLoops; + return true; +} + +static const SCEV* CalcTotalElts(ConstantInt *Factor, + const SCEV *TripCount, + ScalarEvolution &SE) { + if (Factor->equalsInt(1)) + return TripCount; + + const SCEV *FactorSCEV = SE.getSCEV(Factor); + IntegerType *Int32Ty = Factor->getType(); + + if (auto *Count = dyn_cast(TripCount)) { + const SCEV *Elts = SE.getMulExpr(TripCount, FactorSCEV); + unsigned Rem = Count->getAPInt().urem(Factor->getZExtValue()); + if (Rem == 0) + return Elts; + else + return SE.getAddExpr(Elts, SE.getSCEV(ConstantInt::get(Int32Ty, Rem))); + } + + auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitAdd " << *S << "\n"); + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getAPInt() != -Factor->getValue()) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitMul " << *S << "\n"); + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getValue() != Factor) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: VisitDiv " << *S << "\n"); + if (auto *Const = dyn_cast(S->getRHS())) { + if (Const->getValue() != Factor) + return nullptr; + } else + return nullptr; + + if (auto *RoundUp = dyn_cast(S->getLHS())) { + if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { + if (Const->getAPInt() != (Factor->getValue() - 1)) + return nullptr; + } else + return nullptr; + + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Elements: " + << *RoundUp->getOperand(1) << "\n"); + return RoundUp->getOperand(1); + } + return nullptr; + }; + + // (1 + ((-4 + (4 * ((3 + %N) /u 4))) /u 4)) + if (auto *TC = dyn_cast(TripCount)) + if (auto *Div = dyn_cast(TC->getOperand(1))) + if (auto *Add = dyn_cast(Div->getLHS())) + if (auto *Mul = VisitAdd(Add)) + if (auto *Div = VisitMul(Mul)) + if (auto *Elts = VisitDiv(Div)) + return Elts; + + return nullptr; +} + +// Insert the count into the preheader and replace the condition used by the +// selected branch. 
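Concretely, for a plain counted loop (no predication, PerformTest left false), the intent of the code below is to rewrite the IR into roughly the following shape. This is a sketch only: block and value names are invented, and the intrinsic manglings are approximate, derived from the Intrinsics.td additions above.

  ; sketch only: names and intrinsic manglings are illustrative
  preheader:
    call void @llvm.set.loop.iterations.i32(i32 %count)
    br label %loop

  loop:
    %remaining = phi i32 [ %count, %preheader ], [ %remaining.next, %loop ]
    ; ... loop body ...
    %remaining.next = call i32 @llvm.loop.dec.i32.i32.i32(i32 %remaining, i32 1)
    %cmp = icmp ne i32 %remaining.next, 0
    br i1 %cmp, label %loop, label %exit

The phi is only created when InsertPHICounter is set; when it is not (the PPC configuration), the decrement intrinsic simply takes the initial count and the running value is kept implicitly in the target's count register.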
+void HardwareLoops::ConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) { + + auto InitLoopCount = [this](TTI::HardwareLoopInfo &HWLoopInfo, + BasicBlock *BB) { + const SCEV *ExitCount = HWLoopInfo.ExitCount; + + Type *CountType = HWLoopInfo.CountType; + SCEVExpander SCEVE(*SE, *DL, "loopcnt"); + if (!ExitCount->getType()->isPointerTy() && + ExitCount->getType() != CountType) + ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); + + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + + if (HWLoopInfo.Predicate) { + ConstantInt *Factor = cast( + ConstantInt::get(ExitCount->getType(), HWLoopInfo.NumElements)); + ExitCount = CalcTotalElts(Factor, ExitCount, *SE); + } + + return SCEVE.expandCodeFor(ExitCount, CountType, BB->getTerminator()); + }; + + auto InsertIterationSetup = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *LoopCountInit, BasicBlock *BB) { + IRBuilder<> Builder(BB->getTerminator()); + Type *Ty = LoopCountInit->getType(); + + if (HWLoopInfo.PerformTest) { + Function *LoopIter = + Intrinsic::getDeclaration(M, Intrinsic::test_set_loop_iterations, + { Ty, Ty }); + Value *Call = Builder.CreateCall(LoopIter, LoopCountInit); + LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop setup: " << *Call << "\n"); + + auto *LoopGuard = dyn_cast(BB->getTerminator()); + assert((LoopGuard && LoopGuard->isConditional()) && + "Expected conditional branch for while loop"); + //Value *Cmp = Builder.CreateICmpNE(Call, ConstantInt::get(Ty, 0)); + LoopGuard->setCondition(Call); + + if (LoopGuard->getSuccessor(0) != HWLoopInfo.L->getLoopPreheader()) + LoopGuard->swapSuccessors(); + } else { + Function *LoopIter = + Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty); + Builder.CreateCall(LoopIter, LoopCountInit); + } + }; + + auto InsertElementSetup = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *NumElts, BasicBlock *BB) { + Type *Ty = HWLoopInfo.CountType; + IRBuilder<> Builder(BB->getTerminator()); + Value *Ops[] = { NumElts, ConstantInt::get(Ty, HWLoopInfo.NumElements) }; + + + if (HWLoopInfo.PerformTest) { + Function *Setup = + Intrinsic::getDeclaration(M, Intrinsic::test_set_loop_elements, + { Ty, Ty }); + Instruction *Call = Builder.CreateCall(Setup, Ops); + LLVM_DEBUG(dbgs() << "HWLoops: Insert loop elements: " << *Call << "\n"); + + auto *LoopGuard = dyn_cast(BB->getTerminator()); + assert((LoopGuard && LoopGuard->isConditional()) && + "Expected conditional branch for while loop"); + //Value *Cmp = Builder.CreateICmpNE(Call, ConstantInt::get(Ty, 0)); + LoopGuard->setCondition(Call); + + if (LoopGuard->getSuccessor(0) != HWLoopInfo.L->getLoopPreheader()) + LoopGuard->swapSuccessors(); + } else { + Function *Setup = + Intrinsic::getDeclaration(M, Intrinsic::set_loop_elements, + { Ty, Ty }); + Builder.CreateCall(Setup, Ops); + } + }; + + auto InsertCounterPHI = [](TTI::HardwareLoopInfo &HWLoopInfo, + Value *NumElts, Value *EltsRem) { + BasicBlock *Preheader = HWLoopInfo.L->getLoopPreheader(); + BasicBlock *Header = HWLoopInfo.L->getHeader(); + BasicBlock *Latch = HWLoopInfo.ExitBranch->getParent(); + IRBuilder<> Builder(Header->getFirstNonPHI()); + PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2); + Index->addIncoming(NumElts, Preheader); + Index->addIncoming(EltsRem, Latch); + LLVM_DEBUG(dbgs() << "HWLoops: Index PHI: " << *Index << "\n"); + return Index; + }; + + auto InsertDec = [this](TTI::HardwareLoopInfo &HWLoopInfo, Value *NumElts) { + BranchInst *ExitBranch = HWLoopInfo.ExitBranch; + IRBuilder<> CondBuilder(ExitBranch); + Value *Factor = 
ConstantInt::get(NumElts->getType(), + HWLoopInfo.NumElements); + Function *DecFunc = + Intrinsic::getDeclaration(M, Intrinsic::loop_dec, + { NumElts->getType(), NumElts->getType(), + Factor->getType()}); + Value *Ops[] = { NumElts, Factor }; + Value *Call = CondBuilder.CreateCall(DecFunc, Ops); + Value *NewCond = + CondBuilder.CreateICmpNE(Call, + ConstantInt::get(NumElts->getType(), 0)); + Value *OldCond = ExitBranch->getCondition(); + ExitBranch->setCondition(NewCond); + + // The false branch must exit the loop. + if (!HWLoopInfo.L->contains(ExitBranch->getSuccessor(0))) + ExitBranch->swapSuccessors(); + + // The old condition may be dead now, and may have even created a dead PHI + // (the original induction variable). + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + + LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n"); + return cast(Call); + }; + + auto InsertActiveMask = [this](TTI::HardwareLoopInfo &HWLoopInfo, + Value *Elts) { + IRBuilder<> Builder(HWLoopInfo.Predicate); + Function *F = + Intrinsic::getDeclaration(M, Intrinsic::get_active_mask_4, Elts->getType()); + Value *Ops[] = { Elts }; + Instruction *ActiveMask = Builder.CreateCall(F, Ops); + LLVM_DEBUG(dbgs() << "HWLoops: Active Lane Mask: " << *ActiveMask << "\n"); + HWLoopInfo.Predicate->replaceAllUsesWith(ActiveMask); + }; + + BasicBlock *BeginBB = HWLoopInfo.PerformTest ? + HWLoopInfo.L->getLoopPreheader()->getUniquePredecessor() : + HWLoopInfo.L->getLoopPreheader(); + + Value *LoopCountInit = InitLoopCount(HWLoopInfo, BeginBB); + Value *EltsRem = LoopCountInit; + + if (HWLoopInfo.Predicate) { + InsertElementSetup(HWLoopInfo, LoopCountInit, BeginBB); + } else + InsertIterationSetup(HWLoopInfo, LoopCountInit, BeginBB); + + Instruction *LoopDec = InsertDec(HWLoopInfo, EltsRem); + if (HWLoopInfo.InsertPHICounter) { + EltsRem = InsertCounterPHI(HWLoopInfo, LoopCountInit, LoopDec); + LoopDec->setOperand(0, EltsRem); + } + if (HWLoopInfo.Predicate) + InsertActiveMask(HWLoopInfo, EltsRem); + + // Run through the basic blocks of the loop and see if any of them have dead + // PHIs that can be removed. + for (auto I : HWLoopInfo.L->blocks()) + DeleteDeadPHIs(I); +} + +INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) + +FunctionPass *llvm::createHardwareLoops() { return new HardwareLoops(); } Index: lib/Target/ARM/ARM.h =================================================================== --- lib/Target/ARM/ARM.h +++ lib/Target/ARM/ARM.h @@ -37,6 +37,7 @@ Pass *createARMParallelDSPPass(); +FunctionPass *createARMFinaliseHardwareLoopsPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createA15SDOptimizerPass(); Index: lib/Target/ARM/ARMFinalizeHardwareLoops.cpp =================================================================== --- /dev/null +++ lib/Target/ARM/ARMFinalizeHardwareLoops.cpp @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-finalise-hardware-loops" +#define ARM_FINALISE_HW_LOOPS_NAME "ARM hardware loop finalisation pass" + +namespace { + + class ARMFinaliseHWLoops : public MachineFunctionPass { + const ARMBaseInstrInfo *TII = nullptr; + + public: + static char ID; + + ARMFinaliseHWLoops() : MachineFunctionPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool ProcessLoop(MachineLoop *ML); + + void Expand(MachineInstr *Start, MachineInstr *Dec, MachineInstr *End, + MachineInstr *ActiveMask, + SmallVectorImpl &Predicated); + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return ARM_FINALISE_HW_LOOPS_NAME; + } + }; +} + +char ARMFinaliseHWLoops::ID = 0; + +bool ARMFinaliseHWLoops::runOnMachineFunction(MachineFunction &MF) { + auto &MLI = getAnalysis(); + TII = + static_cast(MF.getSubtarget().getInstrInfo()); + LLVM_DEBUG(dbgs() << " ------- ARM HWLOOPS on " << MF.getName() << "\n"); + + bool Changed = false; + for (auto ML : MLI) { + if (!ML->getExitingBlock() || !ML->getHeader() || !ML->getLoopLatch()) + continue; + Changed |= ProcessLoop(ML); + } + return Changed; +} + +bool ARMFinaliseHWLoops::ProcessLoop(MachineLoop *ML) { + + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Processing " << *ML); + auto SearchForStart = [](MachineBasicBlock *MBB) -> MachineInstr* { + for (auto &MI : *MBB) { + if (MI.getOpcode() == ARM::t2LoopStart) + return &MI; + } + return nullptr; + }; + + MachineInstr *Start = nullptr; + + if (auto *Preheader = ML->getLoopPreheader()) { + Start = SearchForStart(Preheader); + if (!Start) { + if (Preheader->pred_size() == 1) { + MachineBasicBlock *PrePreheader = *Preheader->pred_begin(); + Start = SearchForStart(PrePreheader); + } + } + } + + if (!Start) + return false; + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop Start: " << *Start); + + auto IsLoopDec = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2LoopDec; + }; + + auto IsLoopEnd = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2LoopEnd; + }; + + auto IsActiveMask = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2ActiveMask; + }; + + auto IsPredicated = [](MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + break; + case ARM::VMSTR32: + case ARM::VMLDR32: + return true; + } + return false; + }; + + MachineInstr *Dec = nullptr; + MachineInstr *End = nullptr; + MachineInstr *ActiveMask = nullptr; + bool FoundPredicated = false; + bool IsProfitable = true; + SmallVector Predicated; + + for (auto *MBB : ML->getBlocks()) { + for (auto &MI : *MBB) { + // TODO: For scalar loops, check for any instructions that means a + // low-overhead loop wouldn't be profitable. Should we bail if LR has + // been spilt? We'd still need a register to control the loop count but + // the loop index may increase whereas LE(TP) decrement it... 
+      //
+      // Not inserting a low-overhead loop for a vector loop is not really an
+      // option here as we'd either:
+      // - Need to reconstruct a vector loop and a scalar epilogue.
+      // - Try to use VIDUP and create a VPT block to predicate the lanes,
+      //   which would require using a Q register, which may already be
+      //   allocated, for the VIDUP result. It looks like VIDUP wouldn't even
+      //   be helpful for 16xi8 vectors because the instruction can only
+      //   increment by a maximum of 8.
+
+      if (IsLoopDec(MI))
+        Dec = &MI;
+      else if (IsLoopEnd(MI))
+        End = &MI;
+      else if (IsActiveMask(MI))
+        ActiveMask = &MI;
+      else if (IsPredicated(MI)) {
+        FoundPredicated = true;
+        Predicated.push_back(&MI);
+      }
+    }
+  }
+
+  // Check that we've found the necessary components.
+  if (!Dec || !End || (FoundPredicated && !ActiveMask))
+    return false;
+
+  if (!IsProfitable)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop Dec: " << *Dec);
+  LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Found Loop End: " << *End);
+
+  // TODO: Verify that the cmp and br from the WLS either branch to the header
+  // or the exit block.
+  // TODO: Verify that the cmp and br from the LE either branch to the header
+  // or the exit block.
+  // TODO: Verify that all predicated instructions are using ActiveMask.
+
+  Expand(Start, Dec, End, ActiveMask, Predicated);
+  return true;
+}
+
+void ARMFinaliseHWLoops::Expand(MachineInstr *Start, MachineInstr *Dec,
+                                MachineInstr *End, MachineInstr *ActiveMask,
+                                SmallVectorImpl<MachineInstr *> &Predicated) {
+  auto ExpandLoopStart = [this](MachineInstr *Start) {
+    MachineBasicBlock &MBB = *Start->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, Start, Start->getDebugLoc(),
+                                      TII->get(ARM::t2WLSTP));
+    MIB.addDef(ARM::LR);
+    unsigned OpIdx = 0;
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(Start->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Inserted WLSTP: " << *MIB << "\n");
+    Start->eraseFromParent();
+  };
+
+  auto ExpandLoad = [this](MachineInstr *MI) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::t2VLDRW));
+    unsigned OpIdx = 0;
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    MI->eraseFromParent();
+  };
+
+  auto ExpandStore = [this](MachineInstr *MI) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::t2VSTRW));
+    unsigned OpIdx = 0;
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(MI->getOperand(OpIdx++));
+    MIB.add(predOps(ARMCC::AL));
+    MI->eraseFromParent();
+  };
+
+  auto RemoveActiveMask = [](MachineInstr *MI) {
+    MI->eraseFromParent();
+  };
+
+  // Combine the LoopDec and LoopEnd instructions into LE(TP).
+  auto ExpandLoopEnd = [this](MachineInstr *Dec, MachineInstr *End) {
+    // TODO: Check and handle the cases where LR is spilt between Dec and End.
+ MachineBasicBlock &MBB = *End->getParent(); + MachineInstrBuilder MIB = BuildMI(MBB, End, End->getDebugLoc(), + TII->get(ARM::t2LETP)); + MIB.addDef(ARM::LR); + unsigned OpIdx = 0; + MIB.add(End->getOperand(OpIdx++)); + MIB.add(End->getOperand(OpIdx++)); + MIB.add(predOps(ARMCC::AL)); + LLVM_DEBUG(dbgs() << "ARM HWLOOPS: Inserted LETP: " << *MIB << "\n"); + End->eraseFromParent(); + Dec->eraseFromParent(); + }; + + ExpandLoopStart(Start); + ExpandLoopEnd(Dec, End); + + if (ActiveMask) { + for (auto *MI : Predicated) { + if (MI->mayLoad()) + ExpandLoad(MI); + else if (MI->mayStore()) + ExpandStore(MI); + else + llvm_unreachable("unhandled predicated instruction"); + } + RemoveActiveMask(ActiveMask); + } +} + +FunctionPass *llvm::createARMFinaliseHardwareLoopsPass() { + return new ARMFinaliseHWLoops(); +} Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2959,6 +2959,19 @@ // Other cases are autogenerated. break; } + case ARMISD::WhileLoopStart: { + SDValue Size = CurDAG->getTargetConstant( + cast(N->getOperand(1))->getZExtValue(), dl, MVT::i32); + SDValue Ops[] = { Size, + N->getOperand(2), + N->getOperand(3), + N->getOperand(0) }; + SDNode *LoopStart = + CurDAG->getMachineNode(ARM::t2LoopStart, dl, MVT::Other, Ops); + ReplaceUses(N, LoopStart); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::BRCOND: { // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) @@ -2985,7 +2998,39 @@ unsigned CC = (unsigned) cast(N2)->getZExtValue(); - if (InFlag.getOpcode() == ARMISD::CMPZ) { + // Handle loops. + if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { + if (InFlag.getOpcode() == ARMISD::CMPZ) { + // Handle loops. + SDValue Int = InFlag.getOperand(0); + LLVM_DEBUG(dbgs() << "Int: "; Int.dump()); + uint64_t ID = cast(Int->getOperand(1))->getZExtValue(); + + if (ID == Intrinsic::loop_dec) { + SDValue Elements = Int.getOperand(2); + SDValue Size = CurDAG->getTargetConstant( + cast(Int.getOperand(3))->getZExtValue(), dl, + MVT::i32); + + SDValue Args[] = { Elements, Size, Int.getOperand(0) }; + SDNode *LoopDec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), + Args); + ReplaceUses(Int.getNode(), LoopDec); + + SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain }; + SDNode *LoopEnd = + CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs); + + ReplaceUses(N, LoopEnd); + CurDAG->RemoveDeadNode(N); + CurDAG->RemoveDeadNode(InFlag.getNode()); + CurDAG->RemoveDeadNode(Int.getNode()); + return; + } + } + bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); InFlag = N->getOperand(4); Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -233,6 +233,8 @@ // instructions. 
MEMCPY, + WhileLoopStart, + // Vector load N-element structure to all lanes: VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD2DUP, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -526,6 +526,10 @@ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } + const MVT pTypes[] = { MVT::v16i1, MVT::v8i1, MVT::v4i1 }; + for (auto VT : pTypes) + addRegisterClass(VT, &ARM::VCCRRegClass); + for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -703,6 +707,7 @@ setOperationAction(ISD::FMA, MVT::v4f32, Expand); } + setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -12670,6 +12675,31 @@ return V; } +static SDValue PerformHWLoopCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + SDValue CC = N->getOperand(1); + + if (CC->getOperand(0)->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return SDValue(); + + SDValue Int = CC->getOperand(0); + unsigned IntOp = cast(Int.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_elements) + return SDValue(); + + SDValue Chain = N->getOperand(0); + SDValue Elements = Int.getOperand(2); + SDValue Size = Int.getOperand(3); + SDValue ExitBlock = N->getOperand(2); + SDLoc dl(Int); + + SDValue Ops[] = { Chain, Size, Elements, ExitBlock }; + SDValue Res = DCI.DAG.getNode(ARMISD::WhileLoopStart, dl, MVT::Other, Ops); + DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; +} + /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { @@ -12684,6 +12714,7 @@ SDValue RHS = Cmp.getOperand(1); SDValue Chain = N->getOperand(0); SDValue BB = N->getOperand(1); + SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); @@ -12901,6 +12932,7 @@ case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -13,6 +13,12 @@ //===----------------------------------------------------------------------===// // ARM specific DAG Nodes. // +def SDT_ARMWhileLoop : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, OtherVT>]>; + +def ARMWLS : SDNode<"ARMISD::WhileLoopStart", SDT_ARMWhileLoop, + [SDNPHasChain]>; // Type profiles. 
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, Index: lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- lib/Target/ARM/ARMInstrThumb2.td +++ lib/Target/ARM/ARMInstrThumb2.td @@ -1235,6 +1235,89 @@ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { + def t2LoopStart : + t2PseudoInst<(outs), + (ins imm0_7:$size, rGPR:$elts, brtarget:$target), + 4, IIC_Br, []>, Sched<[WriteBr]>; + def t2WLSTP : + T2I<(outs GPRlr:$Rm), (ins imm0_7:$size, GPRlr:$elts, brtarget:$target), IIC_Br, + "wlstp.$size", "\t$Rm, $elts, $target", []>, Sched<[WriteBr]> { + bits<5> Rm; + bits<2> size; + bits<5> elts; + bits<12> target; + } +} + +def t2LoopDec : + t2PseudoInst<(outs GPRlr:$Rm), + (ins GPRlr:$Rn, imm0_7:$size), + 4, IIC_Br, + []>, + Sched<[WriteBr]>; + +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { + def t2LoopEnd : + t2PseudoInst<(outs), + (ins GPRlr:$elts, brtarget:$target), + 4, IIC_Br, []>, Sched<[WriteBr]>; + def t2LETP : + T2I<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), IIC_Br, + "letp", "\t$target", []>, Sched<[WriteBr]> { + bits<5> Rm; + bits<5> elts; + bits<12> target; + } +} + +def t2ActiveMask : + t2PseudoInst<(outs VCCR:$pred), + (ins rGPR:$elts), + 4, IIC_Br, + [(set VCCR:$pred, (int_get_active_mask_4 rGPR:$elts))]>, + Sched<[WriteBr]>; + +def nonext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_load node:$ptr, node:$pred, node:$def), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; +def nontrunc_masked_store : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_store node:$val, node:$ptr, node:$pred), [{ + return !cast(N)->isTruncatingStore(); +}]>; + +def VMLDR32 : t2PseudoInst<(outs QPR:$vec), + (ins t2addrmode_imm12:$addr, VCCR:$pred, i32imm:$imm), 4, + NoItinerary, []>, Sched<[WriteLd]>; +let mayLoad = 1 in +def t2VLDRW : T2I<(outs QPR:$Rm), + (ins rGPR:$addr), NoItinerary, + "vldrw", "\t$Rm, [$addr]", []>, Sched<[WriteLd]> { + bits<6> Rm; + bits<5> addr; +} + +def VMSTR32 : t2PseudoInst<(outs), + (ins QPR:$vec, t2addrmode_imm12:$addr, VCCR:$pred, i32imm:$imm), 4, + NoItinerary, []>, Sched<[WriteST]>; +let mayStore = 1 in +def t2VSTRW : T2I<(outs), + (ins QPR:$Rm, rGPR:$addr), NoItinerary, + "vstrw", "\t$Rm, [$addr]", []>, Sched<[WriteST]> { + bits<6> Rm; + bits<5> addr; +} + +def : Pat<(v4i32 (nonext_masked_load rGPR:$addr, (v4i1 VCCR:$pred), undef)), + (v4i32 (VMLDR32 rGPR:$addr, (i32 0), (v4i1 VCCR:$pred), (i32 2)))>; +def : Pat<(nontrunc_masked_store (v4i32 QPR:$vec), rGPR:$addr, (v4i1 VCCR:$pred)), + (VMSTR32 (v4i32 QPR:$vec), rGPR:$addr, (i32 0), (v4i1 VCCR:$pred), + (i32 2))>; + + //===----------------------------------------------------------------------===// // Load / store Instructions. Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -254,6 +254,11 @@ let DiagnosticString = "operand must be a register sp"; } +def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>; + +def VPR : ARMReg<32, "vpr">; +def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)>; + // restricted GPR register class. Many Thumb2 instructions allow the full // register range for operands, but have undefined behaviours when PC // or SP (R13 or R15) are used. 
The ARM ISA refers to these operands
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -410,6 +410,9 @@
   TargetPassConfig::addIRPasses();
 
+  addPass(createHardwareLoops());
+  addPass(createDeadCodeEliminationPass());
+
   // Run the parallel DSP pass.
   if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
@@ -494,6 +497,8 @@
     addPass(createBreakFalseDeps());
   }
 
+  addPass(createARMFinaliseHardwareLoopsPass());
+
   // Expand some pseudo instructions into multiple instructions to allow
   // proper scheduling.
   addPass(createARMExpandPseudoPass());
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,15 @@
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLegalMaskedStore(Type *Ty) { return true; }
+
+  bool isLegalMaskedLoad(Type *Ty) { return true; }
+
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -628,6 +628,70 @@
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  if (!L->getExitBlock() || !SE.getBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  auto CheckForPredicates = [&HWLoopInfo](Loop *L) {
+    VectorType *VecTy = nullptr;
+    // Inspect the instructions for vector operations.
+    for (auto *BB : L->getBlocks()) {
+      for (auto &I : *BB) {
+        if (!isa<VectorType>(I.getType()))
+          continue;
+
+        auto *VTy = cast<VectorType>(I.getType());
+        if (!VecTy)
+          VecTy = VTy;
+        else if (VecTy->getNumElements() != VTy->getNumElements())
+          return false;
+
+        if (!isa<IntrinsicInst>(&I))
+          continue;
+
+        auto *Call = dyn_cast<IntrinsicInst>(&I);
+        if (Call->getIntrinsicID() != Intrinsic::masked_load &&
+            Call->getIntrinsicID() != Intrinsic::masked_store)
+          continue;
+
+        if (!HWLoopInfo.Predicate)
+          HWLoopInfo.Predicate = cast<Instruction>(Call->getOperand(2));
+        else if (HWLoopInfo.Predicate != cast<Instruction>(Call->getOperand(2)))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  if (!CheckForPredicates(L))
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (auto *BI = dyn_cast<BranchInst>(Preheader->getTerminator()))
+    if (BI->isUnconditional() && Preheader->getUniquePredecessor())
+      HWLoopInfo.PerformTest = true;
+
+  LLVMContext &C = L->getHeader()->getParent()->getParent()->getContext();
+  HWLoopInfo.InsertPHICounter = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.NumElements = 4;
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
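To make the ARM configuration above concrete: with NumElements set to 4, the predicate taken from the masked memory operations, and PerformTest enabled, the HardwareLoops pass is expected to produce IR along these lines. This is illustrative only; value names, alignment and the exact intrinsic manglings are not taken from the patch, and the masked load/store calls use the standard llvm.masked.* forms.

  ; sketch only: names and intrinsic manglings are illustrative
  entry:
    %guard = call i1 @llvm.test.set.loop.elements.i32.i32(i32 %N, i32 4)
    br i1 %guard, label %vector.ph, label %exit

  vector.ph:
    br label %vector.body

  vector.body:
    %elts = phi i32 [ %N, %vector.ph ], [ %elts.rem, %vector.body ]
    %mask = call <4 x i1> @llvm.get.active.mask.4.i32(i32 %elts)
    %wide = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %mask, <4 x i32> undef)
    ; ... vector body ...
    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %dst, i32 4, <4 x i1> %mask)
    %elts.rem = call i32 @llvm.loop.dec.i32.i32.i32(i32 %elts, i32 4)
    %cmp = icmp ne i32 %elts.rem, 0
    br i1 %cmp, label %vector.body, label %exit

ARMFinaliseHWLoops then matches the resulting t2LoopStart/t2LoopDec/t2LoopEnd/t2ActiveMask pseudos and rewrites them to WLSTP/LETP with predicated loads and stores.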
Index: lib/Target/ARM/CMakeLists.txt =================================================================== --- lib/Target/ARM/CMakeLists.txt +++ lib/Target/ARM/CMakeLists.txt @@ -29,6 +29,7 @@ ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp ARMFastISel.cpp + ARMFinalizeHardwareLoops.cpp ARMFrameLowering.cpp ARMHazardRecognizer.cpp ARMInstructionSelector.cpp Index: lib/Target/PowerPC/PPCCTRLoops.cpp =================================================================== --- lib/Target/PowerPC/PPCCTRLoops.cpp +++ lib/Target/PowerPC/PPCCTRLoops.cpp @@ -71,63 +71,7 @@ static cl::opt CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif -// The latency of mtctr is only justified if there are more than 4 -// comparisons that will be removed as a result. -static cl::opt -SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, - cl::desc("Loops with a constant trip count smaller than " - "this value will not use the count register.")); - -STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); - namespace { - struct PPCCTRLoops : public FunctionPass { - -#ifndef NDEBUG - static int Counter; -#endif - - public: - static char ID; - - PPCCTRLoops() : FunctionPass(ID) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - private: - bool mightUseCTR(BasicBlock *BB); - bool convertToCTRLoop(Loop *L); - - private: - const PPCTargetMachine *TM; - const PPCSubtarget *STI; - const PPCTargetLowering *TLI; - const DataLayout *DL; - const TargetLibraryInfo *LibInfo; - const TargetTransformInfo *TTI; - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - bool PreserveLCSSA; - TargetSchedModel SchedModel; - }; - - char PPCCTRLoops::ID = 0; -#ifndef NDEBUG - int PPCCTRLoops::Counter = 0; -#endif #ifndef NDEBUG struct PPCCTRLoopsVerify : public MachineFunctionPass { @@ -153,16 +97,6 @@ #endif // NDEBUG } // end anonymous namespace -INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) - -FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } - #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) @@ -175,512 +109,6 @@ } #endif // NDEBUG -bool PPCCTRLoops::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable(); - if (!TPC) - return false; - - TM = &TPC->getTM(); - STI = TM->getSubtargetImpl(F); - TLI = STI->getTargetLowering(); - - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); - DT = &getAnalysis().getDomTree(); - TTI = &getAnalysis().getTTI(F); - DL = &F.getParent()->getDataLayout(); - auto *TLIP = getAnalysisIfAvailable(); - LibInfo = TLIP ? 
&TLIP->getTLI() : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - SchedModel.init(STI); - - bool MadeChange = false; - - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); - I != E; ++I) { - Loop *L = *I; - if (!L->getParentLoop()) - MadeChange |= convertToCTRLoop(L); - } - - return MadeChange; -} - -static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { - if (IntegerType *ITy = dyn_cast(Ty)) - return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); - - return false; -} - -// Determining the address of a TLS variable results in a function call in -// certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { - const auto *GV = dyn_cast(MemAddr); - if (!GV) { - // Recurse to check for constants that refer to TLS global variables. - if (const auto *CV = dyn_cast(MemAddr)) - for (const auto &CO : CV->operands()) - if (memAddrUsesCTR(TM, CO)) - return true; - - return false; - } - - if (!GV->isThreadLocal()) - return false; - TLSModel::Model Model = TM.getTLSModel(GV); - return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; -} - -// Loop through the inline asm constraints and look for something that clobbers -// ctr. -static bool asmClobbersCTR(InlineAsm *IA) { - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - return false; -} - -bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { - for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); - J != JE; ++J) { - if (CallInst *CI = dyn_cast(J)) { - // Inline ASM is okay, unless it clobbers the ctr register. - if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { - if (asmClobbersCTR(IA)) - return true; - continue; - } - - if (Function *F = CI->getCalledFunction()) { - // Most intrinsics don't become function calls, but some might. - // sin, cos, exp and log are always calls. - unsigned Opcode = 0; - if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { - switch (F->getIntrinsicID()) { - default: continue; - // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr - // we're definitely using CTR. - case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; - -// VisualStudio defines setjmp as _setjmp -#if defined(_MSC_VER) && defined(setjmp) && \ - !defined(setjmp_undefined_for_msvc) -# pragma push_macro("setjmp") -# undef setjmp -# define setjmp_undefined_for_msvc -#endif - - case Intrinsic::setjmp: - -#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) - // let's return it to _setjmp state -# pragma pop_macro("setjmp") -# undef setjmp_undefined_for_msvc -#endif - - case Intrinsic::longjmp: - - // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp - // because, although it does clobber the counter register, the - // control can't then return to inside the loop unless there is also - // an eh_sjlj_setjmp. 
- case Intrinsic::eh_sjlj_setjmp: - - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - case Intrinsic::powi: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::sin: - case Intrinsic::cos: - return true; - case Intrinsic::copysign: - if (CI->getArgOperand(0)->getType()->getScalarType()-> - isPPC_FP128Ty()) - return true; - else - continue; // ISD::FCOPYSIGN is never a library call. - case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; - case Intrinsic::floor: Opcode = ISD::FFLOOR; break; - case Intrinsic::ceil: Opcode = ISD::FCEIL; break; - case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; - case Intrinsic::rint: Opcode = ISD::FRINT; break; - case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; - case Intrinsic::round: Opcode = ISD::FROUND; break; - case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; - case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; - case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; - case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; - } - } - - // PowerPC does not use [US]DIVREM or other library calls for - // operations on regular types which are not otherwise library calls - // (i.e. soft float or atomics). If adapting for targets that do, - // additional care is required here. - - LibFunc Func; - if (!F->hasLocalLinkage() && F->hasName() && LibInfo && - LibInfo->getLibFunc(F->getName(), Func) && - LibInfo->hasOptimizedCodeGen(Func)) { - // Non-read-only functions are never treated as intrinsics. - if (!CI->onlyReadsMemory()) - return true; - - // Conversion happens only for FP calls. - if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) - return true; - - switch (Func) { - default: return true; - case LibFunc_copysign: - case LibFunc_copysignf: - continue; // ISD::FCOPYSIGN is never a library call. - case LibFunc_copysignl: - return true; - case LibFunc_fabs: - case LibFunc_fabsf: - case LibFunc_fabsl: - continue; // ISD::FABS is never a library call. - case LibFunc_sqrt: - case LibFunc_sqrtf: - case LibFunc_sqrtl: - Opcode = ISD::FSQRT; break; - case LibFunc_floor: - case LibFunc_floorf: - case LibFunc_floorl: - Opcode = ISD::FFLOOR; break; - case LibFunc_nearbyint: - case LibFunc_nearbyintf: - case LibFunc_nearbyintl: - Opcode = ISD::FNEARBYINT; break; - case LibFunc_ceil: - case LibFunc_ceilf: - case LibFunc_ceill: - Opcode = ISD::FCEIL; break; - case LibFunc_rint: - case LibFunc_rintf: - case LibFunc_rintl: - Opcode = ISD::FRINT; break; - case LibFunc_round: - case LibFunc_roundf: - case LibFunc_roundl: - Opcode = ISD::FROUND; break; - case LibFunc_trunc: - case LibFunc_truncf: - case LibFunc_truncl: - Opcode = ISD::FTRUNC; break; - case LibFunc_fmin: - case LibFunc_fminf: - case LibFunc_fminl: - Opcode = ISD::FMINNUM; break; - case LibFunc_fmax: - case LibFunc_fmaxf: - case LibFunc_fmaxl: - Opcode = ISD::FMAXNUM; break; - } - } - - if (Opcode) { - EVT EVTy = - TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true); - - if (EVTy == MVT::Other) - return true; - - if (TLI->isOperationLegalOrCustom(Opcode, EVTy)) - continue; - else if (EVTy.isVector() && - TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType())) - continue; - - return true; - } - } - - return true; - } else if (isa(J) && - J->getType()->getScalarType()->isPPC_FP128Ty()) { - // Most operations on ppc_f128 values become calls. 
- return true; - } else if (isa(J) || isa(J) || - isa(J) || isa(J)) { - CastInst *CI = cast(J); - if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || - CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) - return true; - } else if (isLargeIntegerTy(!TM->isPPC64(), - J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::UDiv || - J->getOpcode() == Instruction::SDiv || - J->getOpcode() == Instruction::URem || - J->getOpcode() == Instruction::SRem)) { - return true; - } else if (!TM->isPPC64() && - isLargeIntegerTy(false, J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::Shl || - J->getOpcode() == Instruction::AShr || - J->getOpcode() == Instruction::LShr)) { - // Only on PPC32, for 128-bit integers (specifically not 64-bit - // integers), these might be runtime calls. - return true; - } else if (isa(J) || isa(J)) { - // On PowerPC, indirect jumps use the counter register. - return true; - } else if (SwitchInst *SI = dyn_cast(J)) { - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) - return true; - } - - // FREM is always a call. - if (J->getOpcode() == Instruction::FRem) - return true; - - if (STI->useSoftFloat()) { - switch(J->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FCmp: - return true; - } - } - - for (Value *Operand : J->operands()) - if (memAddrUsesCTR(*TM, Operand)) - return true; - } - - return false; -} -bool PPCCTRLoops::convertToCTRLoop(Loop *L) { - bool MadeChange = false; - - // Do not convert small short loops to CTR loop. - unsigned ConstTripCount = SE->getSmallConstantTripCount(L); - if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) { - SmallPtrSet EphValues; - auto AC = getAnalysis().getAssumptionCache( - *L->getHeader()->getParent()); - CodeMetrics::collectEphemeralValues(L, &AC, EphValues); - CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, *TTI, EphValues); - // 6 is an approximate latency for the mtctr instruction. - if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth())) - return false; - } - - // Process nested loops first. - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { - MadeChange |= convertToCTRLoop(*I); - LLVM_DEBUG(dbgs() << "Nested loop converted\n"); - } - - // If a nested loop has been converted, then we can't convert this loop. - if (MadeChange) - return MadeChange; - - // Bail out if the loop has irreducible control flow. - LoopBlocksRPO RPOT(L); - RPOT.perform(LI); - if (containsIrreducibleCFG(RPOT, *LI)) - return false; - -#ifndef NDEBUG - // Stop trying after reaching the limit (if any). - int Limit = CTRLoopLimit; - if (Limit >= 0) { - if (Counter >= CTRLoopLimit) - return false; - Counter++; - } -#endif - - // We don't want to spill/restore the counter register, and so we don't - // want to use the counter register if the loop contains calls. 
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) - if (mightUseCTR(*I)) - return MadeChange; - - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // If there is an exit edge known to be frequently taken, - // we should not transform this loop. - for (auto &BB : ExitingBlocks) { - Instruction *TI = BB->getTerminator(); - if (!TI) continue; - - if (BranchInst *BI = dyn_cast(TI)) { - uint64_t TrueWeight = 0, FalseWeight = 0; - if (!BI->isConditional() || - !BI->extractProfMetadata(TrueWeight, FalseWeight)) - continue; - - // If the exit path is more frequent than the loop path, - // we return here without further analysis for this loop. - bool TrueIsExit = !L->contains(BI->getSuccessor(0)); - if (( TrueIsExit && FalseWeight < TrueWeight) || - (!TrueIsExit && FalseWeight > TrueWeight)) - return MadeChange; - } - } - - BasicBlock *CountedExitBlock = nullptr; - const SCEV *ExitCount = nullptr; - BranchInst *CountedExitBranch = nullptr; - for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), - IE = ExitingBlocks.end(); I != IE; ++I) { - const SCEV *EC = SE->getExitCount(L, *I); - LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block " - << (*I)->getName() << ": " << *EC << "\n"); - if (isa(EC)) - continue; - if (const SCEVConstant *ConstEC = dyn_cast(EC)) { - if (ConstEC->getValue()->isZero()) - continue; - } else if (!SE->isLoopInvariant(EC, L)) - continue; - - if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32)) - continue; - - // If this exiting block is contained in a nested loop, it is not eligible - // for insertion of the branch-and-decrement since the inner loop would - // end up messing up the value in the CTR. - if (LI->getLoopFor(*I) != L) - continue; - - // We now have a loop-invariant count of loop iterations (which is not the - // constant zero) for which we know that this loop will not exit via this - // existing block. - - // We need to make sure that this block will run on every loop iteration. - // For this to be true, we must dominate all blocks with backedges. Such - // blocks are in-loop predecessors to the header block. - bool NotAlways = false; - for (pred_iterator PI = pred_begin(L->getHeader()), - PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { - if (!L->contains(*PI)) - continue; - - if (!DT->dominates(*I, *PI)) { - NotAlways = true; - break; - } - } - - if (NotAlways) - continue; - - // Make sure this blocks ends with a conditional branch. - Instruction *TI = (*I)->getTerminator(); - if (!TI) - continue; - - if (BranchInst *BI = dyn_cast(TI)) { - if (!BI->isConditional()) - continue; - - CountedExitBranch = BI; - } else - continue; - - // Note that this block may not be the loop latch block, even if the loop - // has a latch block. - CountedExitBlock = *I; - ExitCount = EC; - break; - } - - if (!CountedExitBlock) - return MadeChange; - - BasicBlock *Preheader = L->getLoopPreheader(); - - // If we don't have a preheader, then insert one. If we already have a - // preheader, then we can use it (except if the preheader contains a use of - // the CTR register because some such uses might be reordered by the - // selection DAG after the mtctr instruction). 
-  if (!Preheader || mightUseCTR(Preheader))
-    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
-  if (!Preheader)
-    return MadeChange;
-
-  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
-                    << "\n");
-
-  // Insert the count into the preheader and replace the condition used by the
-  // selected branch.
-  MadeChange = true;
-
-  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
-  LLVMContext &C = SE->getContext();
-  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
-  if (!ExitCount->getType()->isPointerTy() &&
-      ExitCount->getType() != CountType)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
-  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
-  Value *ECValue =
-      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
-  IRBuilder<> CountBuilder(Preheader->getTerminator());
-  Module *M = Preheader->getParent()->getParent();
-  Function *MTCTRFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
-  CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
-  IRBuilder<> CondBuilder(CountedExitBranch);
-  Function *DecFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
-  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
-  Value *OldCond = CountedExitBranch->getCondition();
-  CountedExitBranch->setCondition(NewCond);
-
-  // The false branch must exit the loop.
-  if (!L->contains(CountedExitBranch->getSuccessor(0)))
-    CountedExitBranch->swapSuccessors();
-
-  // The old condition may be dead now, and may have even created a dead PHI
-  // (the original induction variable).
-  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-  // Run through the basic blocks of the loop and see if any of them have dead
-  // PHIs that can be removed.
-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
-
-  ++NumCTRLoops;
-  return MadeChange;
-}
-
 #ifndef NDEBUG
 static bool clobbersCTR(const MachineInstr &MI) {
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9764,11 +9764,11 @@
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
-            Intrinsic::ppc_is_decremented_ctr_nonzero)
+            Intrinsic::loop_dec)
       break;

-    assert(N->getValueType(0) == MVT::i1 &&
-           "Unexpected result type for CTR decrement intrinsic");
+    //assert(N->getValueType(0) == MVT::i1 &&
+    //       "Unexpected result type for CTR decrement intrinsic");
     EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                  N->getValueType(0));
     SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
@@ -13454,10 +13454,12 @@
   SDValue Cond = N->getOperand(1);
   SDValue Target = N->getOperand(2);

-  if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
-      cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+  if (Cond.getOpcode() == ISD::SETCC &&
+      Cond.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+      cast<ConstantSDNode>(Cond.getOperand(0).getOperand(1))->getZExtValue() ==
+          Intrinsic::loop_dec) {
+    Cond = Cond.getOperand(0);

     // We now need to make the intrinsic dead (it cannot be instruction
     // selected).
     DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
@@ -13482,14 +13484,14 @@
   if (LHS.getOpcode() == ISD::AND &&
       LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
       cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_dec &&
       isa<ConstantSDNode>(LHS.getOperand(1)) &&
       !isNullConstant(LHS.getOperand(1)))
     LHS = LHS.getOperand(0);

   if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_dec &&
       isa<ConstantSDNode>(RHS)) {
     assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
            "Counter decrement comparison is not EQ or NE");
Index: lib/Target/PowerPC/PPCInstr64Bit.td
===================================================================
--- lib/Target/PowerPC/PPCInstr64Bit.td
+++ lib/Target/PowerPC/PPCInstr64Bit.td
@@ -382,7 +382,7 @@
                  PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
 def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
                                "mtctr $rS", IIC_SprMTSPR>,
                  PPC970_DGroup_First, PPC970_Unit_FXU;
Index: lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.td
+++ lib/Target/PowerPC/PPCInstrInfo.td
@@ -2600,7 +2600,7 @@
                 PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
 def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                               "mtctr $rS", IIC_SprMTSPR>,
                 PPC970_DGroup_First, PPC970_Unit_FXU;
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -101,7 +101,6 @@
   RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());

   PassRegistry &PR = *PassRegistry::getPassRegistry();
-  initializePPCCTRLoopsPass(PR);
 #ifndef NDEBUG
   initializePPCCTRLoopsVerifyPass(PR);
 #endif
@@ -422,7 +421,7 @@
     addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));

   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCCTRLoops());
+    addPass(createHardwareLoops());

   return false;
 }
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -53,6 +53,11 @@
   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);

   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -7,10 +7,12 @@
 //===----------------------------------------------------------------------===//

 #include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
"llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -31,6 +33,13 @@ cl::desc("Enable using coldcc calling conv for cold " "internal functions")); +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result. +static cl::opt +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + //===----------------------------------------------------------------------===// // // PPC cost model. @@ -204,6 +213,341 @@ return BaseT::getUserCost(U, Operands); } +bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, + TargetLibraryInfo *LibInfo) { + const PPCTargetMachine &TM = ST->getTargetMachine(); + + // Loop through the inline asm constraints and look for something that + // clobbers ctr. + auto asmClobbersCTR = [](InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; + }; + + // Determining the address of a TLS variable results in a function call in + // certain TLS models. + std::function memAddrUsesCTR = + [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool { + const auto *GV = dyn_cast(MemAddr); + if (!GV) { + // Recurse to check for constants that refer to TLS global variables. + if (const auto *CV = dyn_cast(MemAddr)) + for (const auto &CO : CV->operands()) + if (memAddrUsesCTR(CO)) + return true; + + return false; + } + + if (!GV->isThreadLocal()) + return false; + TLSModel::Model Model = TM.getTLSModel(GV); + return Model == TLSModel::GeneralDynamic || + Model == TLSModel::LocalDynamic; + }; + + auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) { + if (IntegerType *ITy = dyn_cast(Ty)) + return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); + + return false; + }; + + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); + J != JE; ++J) { + if (CallInst *CI = dyn_cast(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. + if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { + if (asmClobbersCTR(IA)) + return true; + continue; + } + + if (Function *F = CI->getCalledFunction()) { + // Most intrinsics don't become function calls, but some might. + // sin, cos, exp and log are always calls. + unsigned Opcode = 0; + if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { + switch (F->getIntrinsicID()) { + default: continue; + // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr + // we're definitely using CTR. 
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_dec:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+          // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 4> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if (( TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getParent()->getParent()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+
+  return true;
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
Index: test/CodeGen/PowerPC/ctrloop-intrin.ll
===================================================================
--- test/CodeGen/PowerPC/ctrloop-intrin.ll
+++ test/CodeGen/PowerPC/ctrloop-intrin.ll
@@ -263,7 +263,7 @@
   %8 = sub i64 0, %int_part_ptr.02534
   %scevgep5 = getelementptr i8, i8* %call109, i64 %8
   %scevgep56 = ptrtoint i8* %scevgep5 to i64
-  call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
+  call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
   br label %for.body.116

 for.cond.cleanup:       ; preds = %if.end.138, %if.end.105
@@ -298,8 +298,9 @@
   %conv134 = trunc i32 %add133 to i8
   %scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
   store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
-  %12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
-  br i1 %12, label %for.body.116, label %for.cond.cleanup.115
+  %12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
+  %dec.cmp = icmp ne i64 %12, 0
+  br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115

 if.then.136:       ; preds = %for.cond.cleanup.115
   %incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
@@ -323,10 +324,10 @@
 declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1

 ; Function Attrs: nounwind
-declare void @llvm.ppc.mtctr.i64(i64) #0
+declare void @llvm.set.loop.iterations.i64(i64) #0

 ; Function Attrs: nounwind
-declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
+declare i64 @llvm.loop.dec(i64, i64) #0

 attributes #0 = { nounwind }
"use-soft-float"="false" } Index: test/CodeGen/PowerPC/ppc-passname.ll =================================================================== --- test/CodeGen/PowerPC/ppc-passname.ll +++ test/CodeGen/PowerPC/ppc-passname.ll @@ -1,15 +1,3 @@ -; Test pass name: ppc-ctr-loops. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS -; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops -; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered. -; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS -; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops -; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered. -; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops - - ; Test pass name: ppc-loop-preinc-prep. ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP ; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep Index: test/CodeGen/Thumb2/mve-tailpred.ll =================================================================== --- /dev/null +++ test/CodeGen/Thumb2/mve-tailpred.ll @@ -0,0 +1,78 @@ +; RUN: opt -mtriple=thumbv8 -mcpu=cortex-a72 %s -arm-hardware-loops -dce -S -o - | FileCheck %s --check-prefix=OPT +; RUN: llc -mtriple=thumbv8 -mcpu=cortex-a72 %s -S -o - | FileCheck %s --check-prefix=LLC + +; CHECK-OPT-LABEL: mul_N +; CHECK-OPT: %0 = call i32 @llvm.arm.while.setup(i32 %N, i32 4) +; CHECK-OPT: br i1 %1, label %vector.ph, label %for.cond.cleanup + +; CHECK-OPT: vector.ph: +; CHECK-OPT: br label %vector.body + +; CHECK-OPT: vecctor.body: +; CHECK-OPT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-OPT: %2 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] +; CHECK-OPT: %3 = getelementptr inbounds i32, i32* %a, i32 %index +; CHECK-OPT: %4 = call <4 x i1> @llvm.arm.get.active.mask.4(i32 %2 +; CHECK-OPT: %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %4, <4 x i32> undef) +; CHECK-OPT: %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %7, i32 4, <4 x i1> %4, <4 x i32> undef) +; CHECK-OPT: %8 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load +; CHECK-OPT: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %4) +; CHECK-OPT: %index.next = add i32 %index, 4 +; CHECK-OPT: %11 = call i32 @llvm.arm.loop.end(i32 %2, i32 4) +; CHECk-OPT: %12 = icmp ne i32 %11, 0 +; CHECK-OPT: br i1 %12, label %vector.body, label %for.cond.cleanup + +; CHECK-LLC-LABEL: mul_N +; CHECK-LLC:: wlstp.#4 lr, r3, .LBB0_3 +; CHECK-LLC: .LBB0_2: +; CHECK-LLC: vldrw q8, [r0] +; CHECK-LLC: vldrw q9, [r1] +; CHECK-LLC: adds r0, #16 +; CHECK-LLC: adds r1, #16 +; CHECK-LLC: adds r3, #4 +; CHECK-LLC: vmul.i32 q8, q9, q8 +; CHECK-LLC: vstrw q8, [r2] +; CHECK-LLC: adds r2, #16 +; CHECK-LLC: letp .LBB0_2 +; CHECK-LLC: b .LBB0_3 + +define dso_local arm_aapcs_vfpcc void @mul_N(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  %6 = getelementptr inbounds i32, i32* %c, i32 %index
+  %7 = bitcast i32* %6 to <4 x i32>*
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %7, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, %n.vec
+  br i1 %8, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
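Note: the IR below is an illustrative sketch and not part of the patch. It shows the loop shape the generic HardwareLoops pass is expected to produce with the new llvm.set.loop.iterations / llvm.loop.dec intrinsics, following the same usage as the updated ctrloop-intrin.ll test above. The function name, block labels, and the i32 counter width are hypothetical.

; A counted loop after hardware-loop conversion (assumes %N > 0).
define void @hwloop_sketch(i32* %p, i32 %N) {
entry:
  ; The trip count is registered once in the preheader.
  call void @llvm.set.loop.iterations.i32(i32 %N)
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %count = phi i32 [ %N, %entry ], [ %count.next, %loop ]
  %addr = getelementptr inbounds i32, i32* %p, i32 %iv
  store i32 %iv, i32* %addr
  %iv.next = add i32 %iv, 1
  ; Decrement the remaining iteration count by 1; a non-zero result
  ; means another iteration should run.
  %count.next = call i32 @llvm.loop.dec(i32 %count, i32 1)
  %cmp = icmp ne i32 %count.next, 0
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.dec(i32, i32)

On PowerPC, the backend changes above are then expected to match llvm.set.loop.iterations to the MTCTRloop/MTCTR8loop patterns and the llvm.loop.dec/compare/branch sequence to the counter-based branch, as in the PPCISelLowering.cpp and .td hunks.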