Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,17 @@
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  /// Return true if a call to \p F is expected to be lowered to an actual
+  /// call instruction rather than expanded inline.
+  bool isLoweredToCall(const Function *F);
+
+  /// Return true if \p L should be converted into a low-overhead hardware
+  /// loop; on success \p HWLoopInfo describes the loop counter to use.
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,6 +36,13 @@
 
 #define DEBUG_TYPE "armtti"
 
+// Command-line switch for low-overhead loop generation. It defaults to true,
+// so isHardwareLoopProfitable rejects every loop unless
+// -disable-arm-loloops=false is passed.
+static cl::opt<bool> DisableLowOverheadLoops(
+  "disable-arm-loloops", cl::Hidden, cl::init(true),
+  cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,227 @@
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+/// Return true if a call to \p F is expected to be lowered to a real call
+/// rather than to inline instructions.
+///
+/// Non-intrinsic functions, and intrinsics that are neither Arm-specific nor
+/// listed in the switch below, are deferred to the generic BaseT
+/// implementation.
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  // Ordinary functions get no Arm-specific treatment.
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  // These are always reported as calls, whatever the subtarget features.
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true;
+  // For these, the answer depends on the return type and on which
+  // floating-point features the subtarget provides.
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  // Masked memory accesses are only treated as instructions with MVE.
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps();
+  // Overflow-checking and saturating add/sub are never reported as calls.
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false;
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
+/// Decide whether \p L should be converted into a low-overhead hardware loop.
+///
+/// The loop is rejected unless the subtarget has the low-overhead branch
+/// extension, low-overhead loops have not been disabled on the command line,
+/// the loop has a single exit block and a loop-invariant backedge-taken count
+/// no wider than 32 bits, and no instruction in the loop is expected to
+/// become a call. On success \p HWLoopInfo is set up for an i32 counter that
+/// is kept in a register and decremented by one, and true is returned.
+///
+/// \p AC and \p LibInfo are not used by this implementation.
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  // For now, for simplicity, only support loops with one exit block.
+  if (!L->getExitBlock())
+    return false;
+
+  // The backedge-taken count must be known and loop-invariant.
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  // The trip count is one more than the backedge-taken count.
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  // NOTE(review): getBitWidth() is the width of the trip count's type, not the
+  // magnitude of its maximum value, so every wider-than-i32 count is rejected
+  // -- confirm this is the intended check.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  // MaybeCall returns true if the given instruction is expected to end up as
+  // a call.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    // Anything the target lowering marks as a LibCall is a call.
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls as
+    // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special case for instance the SDIV of 64bit integers and the
+    // use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false;
+      }
+    }
+
+    // We'll need a libcall to perform double precision operations on a single
+    // precision only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false;
+  };
+
+  // Scan the instructions to see if there's any that we know will turn into a
+  // call.
+  for (auto *BB : L->getBlocks())
+    for (auto &I : *BB)
+      if (MaybeCall(I))
+        return false;
+
+  // TODO: Check whether the trip count calculation is expensive. If L is the
+  // inner loop but we know it has a low trip count, calculating that trip
+  // count (in the parent loop) may be detrimental.
+
+  LLVMContext &C = L->getHeader()->getContext();
+  // Use a 32-bit counter that lives in a register and is decremented by one.
+  HWLoopInfo.CounterInReg = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
@@ -0,0 +1,427 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+; Exercises which loops are converted into hardware loops depending on whether
+; the calls in their bodies are expected to survive as real calls for the
+; selected subtarget features.
+
+; A call to an external function: the loop must not be converted.
+; CHECK-LABEL: skip_call
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @skip_call(i32 %n) {
+entry:
+  %cmp6 = icmp eq i32 %n, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
+  %add = add nsw i32 %call, %res.07
+  %inc1 = add nuw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; Arm-specific intrinsics are assumed to map to an instruction, so the loop
+; is converted; the counter steps by 2 up to 100, i.e. 50 iterations.
+; CHECK-LABEL: test_target_specific
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
+; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %loop, label %exit
+
+define i32 @test_target_specific(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
+  %count.next = add nuw i32 %count, 2
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; Half-precision fabs: converted with +fullfp16 or +mve.fp, but not for the
+; base configuration or integer-only +mve.
+; CHECK-LABEL: test_fabs_f16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_f16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %abs = call half @llvm.fabs.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %abs, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Single-precision fabs, with the same expectations as the f16 case.
+; CHECK-LABEL: test_fabs
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define float @test_fabs(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.fabs.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; Double-precision fabs: only the +fp-armv8 run converts the loop.
+; CHECK-LABEL: test_fabs_64
+; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
+; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:       void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %abs = call double @llvm.fabs.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %abs, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Vector fabs: converted with +mve.fp but not with integer-only +mve.
+; CHECK-LABEL: test_fabs_vec
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define <4 x float> @test_fabs_vec(<4 x float>* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
+  %res = fadd <4 x float> %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret <4 x float> %res
+}
+
+; llvm.log is always treated as a call, so the loop is never converted.
+; CHECK-LABEL: test_log
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define float @test_log(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.log.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; Half-precision sqrt: converted for the three runs with fp features enabled.
+; CHECK-LABEL: test_sqrt_16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:     call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %sqrt = call half @llvm.sqrt.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %sqrt, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Single-precision sqrt: converted with +fullfp16 or +mve.fp.
+; CHECK-LABEL: test_sqrt
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define void @test_sqrt(float* %a, float* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %sqrt = call float @llvm.sqrt.f32(float %load.a)
+  %addr.b = getelementptr float, float* %b, i32 %count
+  store float %sqrt, float* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Double-precision sqrt: only the +fp-armv8 run converts the loop.
+; CHECK-LABEL: test_sqrt_64
+; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
+; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:       call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %sqrt = call double @llvm.sqrt.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %sqrt, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Vector sqrt: converted with +fullfp16 or +mve.fp.
+; CHECK-LABEL: test_sqrt_vec
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  store <4 x float> %sqrt, <4 x float>* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Overflow-checking arithmetic is not treated as a call.
+; CHECK-LABEL: test_overflow
+; CHECK: call void @llvm.set.loop.iterations
+define i32 @test_overflow(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
+  %res = extractvalue {i32, i1} %sadd, 0
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; Saturating arithmetic is not treated as a call.
+; TODO: We should be able to generate a qadd/sub
+; CHECK-LABEL: test_sat
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
+define i32 @test_sat(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; Masked loads and stores are only treated as instructions when MVE is
+; available, so only the +mve and +mve.fp runs convert the loop.
+; CHECK-LABEL: test_masked_i32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
+  %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
+  %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
+  %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %res = add <4 x i32> %load.a, %load.b
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; As above, for <4 x float> data.
+; CHECK-LABEL: test_masked_f32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
+  %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Masked gather/scatter: likewise only converted when MVE is available.
+; CHECK-LABEL: test_gather_scatter
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+declare i32 @llvm.arm.smlad(i32, i32, i32)
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare float @llvm.log.f32(float)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare half @llvm.sqrt.f16(half)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
@@ -0,0 +1,37 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; The loop counter and trip count are i64, wider than the 32-bit register the
+; hardware loop counter has to live in, so the loop must be left alone.
+; CHECK-LABEL: counter_too_large
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @counter_too_large(i64 %n) {
+entry:
+  %cmp7 = icmp eq i64 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+
+while.body:
+  %i.09 = phi i64 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %idxprom = trunc i64 %i.09 to i32
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %inc1 = add nuw i64 %i.09, 1
+  %cmp = icmp ult i64 %inc1, %n
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
@@ -0,0 +1,267 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; A 32-bit urem does not stop the loop being converted.
+; CHECK-LABEL: do_with_i32_urem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = urem i32 %i.09, 5
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; A 32-bit srem does not stop the loop being converted.
+; CHECK-LABEL: do_with_i32_srem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = srem i32 %i.09, 5
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; A 32-bit udiv does not stop the loop being converted.
+; CHECK-LABEL: do_with_i32_udiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = udiv i32 %i.09, 5
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; A 32-bit sdiv does not stop the loop being converted.
+; CHECK-LABEL: do_with_i32_sdiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = sdiv i32 %i.09, 5
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; A 64-bit urem is expected to become a call, so the loop is left alone.
+; CHECK-LABEL: do_with_i64_urem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = urem i64 %conv, 5
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; A 64-bit srem is expected to become a call, so the loop is left alone.
+; CHECK-LABEL: do_with_i64_srem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = srem i64 %conv, 5
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; A 64-bit udiv is expected to become a call, so the loop is left alone.
+; CHECK-LABEL: do_with_i64_udiv
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = udiv i64 %conv, 5
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; A 64-bit sdiv is expected to become a call, so the loop is left alone.
+; CHECK-LABEL: do_with_i64_sdiv
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+define i64 @do_with_i64_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = sdiv i64 %conv, 5
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -0,0 +1,207 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
+
+; CHECK-LABEL: test_fptosi
+; CHECK: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+define void @test_fptosi(i32 %n, i32** %g, double** %d) { ; for each i not divisible by 10, stores fptosi((*d)[i]) to (*g)[i]; hardware loop expected with +fp-armv8, none with +soft-float
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500 ; unsigned range test on n - 1: the loop is entered only for 1 <= n <= 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4 ; source array base, loaded once ahead of the loop
+  %2 = load i32*, i32** %g, align 4 ; destination array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] ; induction variable i, starts at 0
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0 ; multiples of 10 jump straight to the latch
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %3 to i32 ; double -> signed i32 conversion, the operation this test is named after
+  %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
+  store i32 %conv, i32* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n ; latch: keep iterating while i + 1 < n (unsigned)
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: test_fptoui
+; CHECK-FP: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+define void @test_fptoui(i32 %n, i32** %g, double** %d) { ; for each i not divisible by 10, stores fptoui((*d)[i]) to (*g)[i]; hardware loop expected with +fp-armv8, none with +soft-float
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500 ; unsigned range test on n - 1: the loop is entered only for 1 <= n <= 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4 ; source array base, loaded once ahead of the loop
+  %2 = load i32*, i32** %g, align 4 ; destination array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] ; induction variable i, starts at 0
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0 ; multiples of 10 jump straight to the latch
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %3 to i32 ; double -> unsigned i32 conversion, the operation this test is named after
+  %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
+  store i32 %conv, i32* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n ; latch: keep iterating while i + 1 < n (unsigned)
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: load_store_float
+; CHECK: while.body.lr.ph:
+; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+define void @load_store_float(i32 %n, double** %d, double** %g) { ; for each i not divisible by 10, copies (*d)[i] to (*g)[i] with no FP arithmetic; hardware loop expected in both run configurations
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500 ; unsigned range test on n - 1: the loop is entered only for 1 <= n <= 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4 ; source array base, loaded once ahead of the loop
+  %2 = load double*, double** %g, align 4 ; destination array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] ; induction variable i, starts at 0
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0 ; multiples of 10 jump straight to the latch
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8 ; the double is only moved, never operated on
+  %arrayidx3 = getelementptr inbounds double, double* %2, i32 %i.012
+  store double %3, double* %arrayidx3, align 8
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n ; latch: keep iterating while i + 1 < n (unsigned)
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: fp_add
+; CHECK: while.body.lr.ph:
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+define void @fp_add(i32 %n, float** %d, float** %g) { ; for each i not divisible by 10, does (*g)[i] += (*d)[i] on floats; hardware loop expected with +fp-armv8, none with +soft-float
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500 ; unsigned range test on n - 1: the loop is entered only for 1 <= n <= 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load float*, float** %d, align 4 ; addend array base, loaded once ahead of the loop
+  %2 = load float*, float** %g, align 4 ; accumulated array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] ; induction variable i, starts at 0
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0 ; multiples of 10 jump straight to the latch
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds float, float* %1, i32 %i.012
+  %3 = load float, float* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %2, i32 %i.012
+  %4 = load float, float* %arrayidx3, align 4
+  %add = fadd float %3, %4 ; single-precision add, the FP arithmetic this test is named after
+  store float %add, float* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n ; latch: keep iterating while i + 1 < n (unsigned)
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll
@@ -0,0 +1,155 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED
+
+; DISABLED-NOT: llvm.set.loop.iterations
+; DISABLED-NOT: llvm.loop.decrement
+
+@g = common local_unnamed_addr global i32* null, align 4 ; array pointer loaded in the preheaders of do_inc1, do_inc2 and do_dec2
+
+; CHECK-LABEL: do_copy
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end
+define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { ; unguarded do-while copying i32 words from q to p while a counter runs from n down to 0; %n is expected to be used directly as the iteration count
+entry:
+  br label %while.body
+
+while.body:
+  %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ] ; read cursor
+  %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ] ; write cursor
+  %x.addr.03 = phi i32 [ %dec, %while.body ], [ %n, %entry ] ; remaining count, starts at n
+  %dec = add nsw i32 %x.addr.03, -1
+  %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
+  %0 = load i32, i32* %q.addr.05, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+  store i32 %0, i32* %p.addr.04, align 4
+  %tobool = icmp eq i32 %dec, 0 ; stop once the decremented counter reaches zero
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret i32 0
+}
+
+; CHECK-LABEL: do_inc1
+; CHECK: while.body.lr.ph:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_inc1(i32 %n) { ; sums the first n elements of the array @g points to, stepping i by 1; %n is expected to be used directly as the iteration count
+entry:
+  %cmp7 = icmp eq i32 %n, 0 ; guard: bypass the loop when n == 0
+  br i1 %cmp7, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4 ; array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ] ; induction variable i, starts at 0
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] ; running sum
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n ; leave once i + 1 == n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] ; 0 when the loop was skipped, else the final sum
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_inc2
+; CHECK: while.body.lr.ph:
+; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1
+; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1
+; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+define i32 @do_inc2(i32 %n) { ; sums every second element (i = 0, 2, 4, ... while i < n) of the array @g points to; expected iteration count is ((n - 1) >> 1) + 1
+entry:
+  %cmp7 = icmp sgt i32 %n, 0 ; guard: the loop runs only for n > 0 (signed)
+  br i1 %cmp7, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4 ; array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %add1, %while.body ] ; induction variable i, starts at 0
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] ; running sum
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %add1 = add nuw nsw i32 %i.09, 2 ; step of 2
+  %cmp = icmp slt i32 %add1, %n ; keep iterating while i + 2 < n (signed)
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] ; 0 when the loop was skipped, else the final sum
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_dec2
+
+; CHECK: while.body.lr.ph:
+; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1
+; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2
+; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2
+; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]]
+; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1
+; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+define i32 @do_dec2(i32 %n) { ; sums elements of the array @g points to at i = n, n - 2, ... while i > 2; expected iteration count is ((n + 1 - smin(n, 2)) >> 1) + 1
+entry:
+  %cmp6 = icmp sgt i32 %n, 0 ; guard: the loop runs only for n > 0 (signed)
+  br i1 %cmp6, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4 ; array base, loaded once ahead of the loop
+  br label %while.body
+
+while.body:
+  %i.08 = phi i32 [ %n, %while.body.lr.ph ], [ %sub, %while.body ] ; induction variable i, starts at n and counts down
+  %res.07 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] ; running sum
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.08
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.07
+  %sub = add nsw i32 %i.08, -2 ; step of -2
+  %cmp = icmp sgt i32 %i.08, 2 ; tests the pre-decrement i: continue while i > 2 (signed)
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] ; 0 when the loop was skipped, else the final sum
+  ret i32 %res.0.lcssa
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll
@@ -0,0 +1,72 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: early_exit
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i32 @early_exit(i32* nocapture readonly %a, i32 %max, i32 %n) { ; scans a[] and leaves early at the first element greater than max; the loop has two exits and no hardware-loop intrinsics are expected
+entry:
+  br label %do.body
+
+do.body:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] ; induction variable i, starts at 0
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.0
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp sgt i32 %0, %max
+  br i1 %cmp, label %do.end, label %if.end ; first exit: data-dependent, taken when a[i] > max
+
+if.end:
+  %inc = add nuw i32 %i.0, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %do.body, label %if.end.do.end_crit_edge ; second exit: counted latch, taken once i + 1 >= n (unsigned)
+
+if.end.do.end_crit_edge:
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i32 %inc
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 ; counted exit returns a[i + 1]
+  br label %do.end
+
+do.end:
+  %1 = phi i32 [ %.pre, %if.end.do.end_crit_edge ], [ %0, %do.body ] ; result depends on which exit was taken
+  ret i32 %1
+}
+
+; CHECK-LABEL: nested
+; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body3.us
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
+
+; CHECK-NOT: {{%[^ ]+}} = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NOT: br i1 {{%[^ ]+}}, label %while.cond1.preheader.us, label %while.end7
+define void @nested(i32* nocapture %A, i32 %N) { ; N x N loop nest storing i*N + j to A[i*N + j]; only the inner loop is expected to become a hardware loop
+entry: ; the two trailing negative directives use inline regexes: a variable captured inside a negative directive is never bound, so it cannot be reused
+  %cmp20 = icmp eq i32 %N, 0 ; guard: bypass both loops when N == 0
+  br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+while.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ] ; outer induction variable i
+  %mul.us = mul i32 %i.021.us, %N ; row offset i * N
+  br label %while.body3.us
+
+while.body3.us:
+  %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ] ; inner induction variable j
+  %add.us = add i32 %j.019.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+  store i32 %add.us, i32* %arrayidx.us, align 4
+  %inc.us = add nuw i32 %j.019.us, 1
+  %exitcond = icmp eq i32 %inc.us, %N
+  br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us ; inner latch: the branch expected to be rewritten around the loop-decrement intrinsic
+
+while.cond1.while.end_crit_edge.us:
+  %inc6.us = add nuw i32 %i.021.us, 1
+  %exitcond23 = icmp eq i32 %inc6.us, %N
+  br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us ; outer latch: expected to stay an ordinary compare-and-branch
+
+while.end7:
+  ret void
+}