Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,12 @@
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLoweredToCall(const Function *F);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,6 +36,10 @@
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> DisableLowOverheadLoops(
+  "disable-arm-loloops", cl::Hidden, cl::init(true),
+  cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,208 @@
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+/// Returns true if a call to \p F is expected to remain a real call after
+/// lowering. Non-intrinsic functions are deferred to the generic
+/// implementation; intrinsics are classified using the subtarget's
+/// floating-point and MVE features.
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  // Only intrinsics get Arm-specific treatment below.
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true;
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps();
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false;
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
+/// Decide whether \p L should become a low-overhead hardware loop. Returns
+/// false if the subtarget lacks LOB, the disable-arm-loloops option is set,
+/// the loop does not have a single exit block, the backedge-taken count is
+/// not loop-invariant or not computable, the trip count is wider than 32
+/// bits, or any instruction in the loop may be lowered to a call. Otherwise
+/// fills in \p HWLoopInfo (counter in a register, i32 count type, decrement
+/// of 1) and returns true.
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  // For now, for simplicity, only support loops with one exit block.
+  if (!L->getExitBlock())
+    return false;
+
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls as
+    // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special case for instance the SDIV of 64bit integers and the
+    // use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false;
+      }
+    }
+
+    // We'll need a libcall to perform double precision operations on a single
+    // precision only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false;
+  };
+
+  // Scan the instructions to see if there's any that we know will turn into a
+  // call.
+  for (auto *BB : L->getBlocks())
+    for (auto &I : *BB)
+      if (MaybeCall(I))
+        return false;
+
+  // TODO: Check whether the trip count calculation is expensive. If L is the
+  // inner loop but we know it has a low trip count, calculating that trip
+  // count (in the parent loop) may be detrimental.
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CounterInReg = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/calls.ll
@@ -0,0 +1,406 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+; Each loop below contains a call or an intrinsic; the checks record, per set
+; of subtarget features, whether a hardware loop is expected to be generated.
+
+; CHECK-LABEL: skip_call
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @skip_call(i32 %n) {
+entry:
+  %cmp6 = icmp eq i32 %n, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
+  %add = add nsw i32 %call, %res.07
+  %inc1 = add nuw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: test_target_specific
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
+; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %loop, label %exit
+
+define i32 @test_target_specific(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
+  %count.next = add nuw i32 %count, 2
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; CHECK-LABEL: test_fabs_f16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_f16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %abs = call half @llvm.fabs.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %abs, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_fabs
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+define float @test_fabs(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.fabs.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; CHECK-LABEL: test_fabs_64
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %abs = call double @llvm.fabs.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %abs, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_fabs_vec
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define <4 x float> @test_fabs_vec(<4 x float>* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
+  %res = fadd <4 x float> %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_log
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define float @test_log(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.log.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; CHECK-LABEL: test_sqrt_16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %sqrt = call half @llvm.sqrt.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %sqrt, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+; CHECK-LABEL: test_sqrt
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define void @test_sqrt(float* %a, float* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %sqrt = call float @llvm.sqrt.f32(float %load.a)
+  %addr.b = getelementptr float, float* %b, i32 %count
+  store float %sqrt, float* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_sqrt_64
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %sqrt = call double @llvm.sqrt.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %sqrt, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_sqrt_vec
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  store <4 x float> %sqrt, <4 x float>* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_overflow
+; CHECK: call void @llvm.set.loop.iterations
+define i32 @test_overflow(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
+  %res = extractvalue {i32, i1} %sadd, 0
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; TODO: We should be able to generate a qadd/sub
+; CHECK-LABEL: test_sat
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
+define i32 @test_sat(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; CHECK-LABEL: test_masked_i32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
+  %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
+  %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
+  %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %res = add <4 x i32> %load.a, %load.b
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_masked_f32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
+  %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_gather_scatter
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+declare i32 @llvm.arm.smlad(i32, i32, i32)
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare float @llvm.log.f32(float)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare half @llvm.sqrt.f16(half)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/counter.ll
@@ -0,0 +1,36 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; The loop counter is an i64, so no hardware loop should be generated.
+; CHECK-LABEL: counter_too_large
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @counter_too_large(i64 %n) {
+entry:
+  %cmp7 = icmp eq i64 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+
+while.body:
+  %i.09 = phi i64 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %idxprom = trunc i64 %i.09 to i32
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %inc1 = add nuw i64 %i.09, 1
+  %cmp = icmp ult i64 %inc1, %n
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/do-rem.ll
@@ -0,0 +1,261 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; The loops below with i32 div/rem are expected to become hardware loops; the
+; loops with i64 div/rem are not.
+; CHECK-LABEL: do_with_i32_urem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = urem i32 %i.09, 5
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_srem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = srem i32 %i.09, 5
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_udiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = udiv i32 %i.09, 5
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_sdiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+
+define i32 @do_with_i32_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = sdiv i32 %i.09, 5
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_urem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = urem i64 %conv, 5
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_srem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = srem i64 %conv, 5
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_udiv
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = udiv i64 %conv, 5
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_sdiv
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+define i64 @do_with_i64_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = sdiv i64 %conv, 5
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
===================================================================
--- llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
+++ llvm/trunk/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -0,0 +1,207 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
+
+; CHECK-LABEL: test_fptosi
+; CHECK: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+define void @test_fptosi(i32 %n, i32** %g, double** %d) {
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4
+  %2 = load i32*, i32** %g, align 4
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %3 to i32
+  %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
+  store i32 %conv, i32* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: test_fptoui
+; CHECK-FP: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label
%while.body, label %cleanup.loopexit + +; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations + +define void @test_fptoui(i32 %n, i32** %g, double** %d) { +entry: + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 500 + br i1 %0, label %while.body.lr.ph, label %cleanup + +while.body.lr.ph: + %1 = load double*, double** %d, align 4 + %2 = load i32*, i32** %g, align 4 + br label %while.body + +while.body: + %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] + %rem = urem i32 %i.012, 10 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %if.end4, label %if.then2 + +if.then2: + %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012 + %3 = load double, double* %arrayidx, align 8 + %conv = fptoui double %3 to i32 + %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012 + store i32 %conv, i32* %arrayidx3, align 4 + br label %if.end4 + +if.end4: + %inc = add nuw i32 %i.012, 1 + %cmp1 = icmp ult i32 %inc, %n + br i1 %cmp1, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: + br label %cleanup + +cleanup: + ret void +} + +; CHECK-LABEL: load_store_float +; CHECK: while.body.lr.ph: +; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 +; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-NEXT: br label %while.body + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit + +define void @load_store_float(i32 %n, double** %d, double** %g) { +entry: + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 500 + br i1 %0, label %while.body.lr.ph, label %cleanup + +while.body.lr.ph: + %1 = load double*, double** %d, align 4 + %2 = load double*, double** %g, align 4 + br label %while.body + +while.body: + 
%i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] + %rem = urem i32 %i.012, 10 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %if.end4, label %if.then2 + +if.then2: + %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012 + %3 = load double, double* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds double, double* %2, i32 %i.012 + store double %3, double* %arrayidx3, align 8 + br label %if.end4 + +if.end4: + %inc = add nuw i32 %i.012, 1 + %cmp1 = icmp ult i32 %inc, %n + br i1 %cmp1, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: + br label %cleanup + +cleanup: + ret void +} + +; CHECK-LABEL: fp_add +; CHECK: while.body.lr.ph: + +; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations + +; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 +; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 +; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: br label %while.body + +; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement + +; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit + +define void @fp_add(i32 %n, float** %d, float** %g) { +entry: + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 500 + br i1 %0, label %while.body.lr.ph, label %cleanup + +while.body.lr.ph: + %1 = load float*, float** %d, align 4 + %2 = load float*, float** %g, align 4 + br label %while.body + +while.body: + %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ] + %rem = urem i32 %i.012, 10 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %if.end4, label %if.then2 + +if.then2: + %arrayidx = getelementptr inbounds float, float* %1, i32 %i.012 + %3 = load float, float* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds float, 
float* %2, i32 %i.012 + %4 = load float, float* %arrayidx3, align 4 + %add = fadd float %3, %4 + store float %add, float* %arrayidx3, align 4 + br label %if.end4 + +if.end4: + %inc = add nuw i32 %i.012, 1 + %cmp1 = icmp ult i32 %inc, %n + br i1 %cmp1, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: + br label %cleanup + +cleanup: + ret void +} Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll =================================================================== --- llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll +++ llvm/trunk/test/Transforms/HardwareLoops/ARM/simple-do.ll @@ -0,0 +1,155 @@ +; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s +; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED +; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED + +; DISABLED-NOT: llvm.set.loop.iterations +; DISABLED-NOT: llvm.loop.decrement + +@g = common local_unnamed_addr global i32* null, align 4 + +; CHECK-LABEL: do_copy +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) +; CHECK: br label %while.body + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %while.end +define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { +entry: + br label %while.body + +while.body: + %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ] + %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ] + %x.addr.03 = phi i32 [ %dec, %while.body ], [ %n, %entry ] + %dec = add nsw i32 %x.addr.03, -1 + %incdec.ptr = getelementptr inbounds i32, i32* 
%q.addr.05, i32 1 + %0 = load i32, i32* %q.addr.05, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1 + store i32 %0, i32* %p.addr.04, align 4 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: + ret i32 0 +} + +; CHECK-LABEL: do_inc1 +; CHECK: while.body.lr.ph: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) +; CHECK-NEXT: br label %while.body + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit + +define i32 @do_inc1(i32 %n) { +entry: + %cmp7 = icmp eq i32 %n, 0 + br i1 %cmp7, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: + %0 = load i32*, i32** @g, align 4 + br label %while.body + +while.body: + %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ] + %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] + %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09 + %1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %1, %res.08 + %inc1 = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc1, %n + br i1 %exitcond, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: do_inc2 +; CHECK: while.body.lr.ph: +; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1 +; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1 +; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1 +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-NEXT: br label %while.body + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[LOOP_DEC]] = call i32 
@llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit +define i32 @do_inc2(i32 %n) { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: + %0 = load i32*, i32** @g, align 4 + br label %while.body + +while.body: + %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %add1, %while.body ] + %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] + %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09 + %1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %1, %res.08 + %add1 = add nuw nsw i32 %i.09, 2 + %cmp = icmp slt i32 %add1, %n + br i1 %cmp, label %while.body, label %while.end.loopexit + +while.end.loopexit: + br label %while.end + +while.end: + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: do_dec2 + +; CHECK: while.body.lr.ph: +; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1 +; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2 +; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2 +; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]] +; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1 +; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1 +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-NEXT: br label %while.body + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit +define i32 @do_dec2(i32 %n) { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: + %0 = load i32*, i32** @g, align 4 + br label %while.body + +while.body: + %i.08 = phi i32 [ %n, %while.body.lr.ph 
], [ %sub, %while.body ] + %res.07 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ] + %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.08 + %1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %1, %res.07 + %sub = add nsw i32 %i.08, -2 + %cmp = icmp sgt i32 %i.08, 2 + br i1 %cmp, label %while.body, label %while.end.loopexit + +while.end.loopexit: + br label %while.end + +while.end: + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ] + ret i32 %res.0.lcssa +} Index: llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll =================================================================== --- llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll +++ llvm/trunk/test/Transforms/HardwareLoops/ARM/structure.ll @@ -0,0 +1,72 @@ +; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s + +; CHECK-LABEL: early_exit +; CHECK-NOT: llvm.set.loop.iterations +; CHECK-NOT: llvm.loop.decrement +define i32 @early_exit(i32* nocapture readonly %a, i32 %max, i32 %n) { +entry: + br label %do.body + +do.body: + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.0 + %0 = load i32, i32* %arrayidx, align 4 + %cmp = icmp sgt i32 %0, %max + br i1 %cmp, label %do.end, label %if.end + +if.end: + %inc = add nuw i32 %i.0, 1 + %cmp1 = icmp ult i32 %inc, %n + br i1 %cmp1, label %do.body, label %if.end.do.end_crit_edge + +if.end.do.end_crit_edge: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i32 %inc + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %do.end + +do.end: + %1 = phi i32 [ %.pre, %if.end.do.end_crit_edge ], [ %0, %do.body ] + ret i32 %1 +} + +; CHECK-LABEL: nested +; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us + +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br label 
%while.body3.us + +; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us + +; CHECK-NOT: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) +; CHECK-NOT: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7 +define void @nested(i32* nocapture %A, i32 %N) { +entry: + %cmp20 = icmp eq i32 %N, 0 + br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us + +while.cond1.preheader.us: + %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ] + %mul.us = mul i32 %i.021.us, %N + br label %while.body3.us + +while.body3.us: + %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ] + %add.us = add i32 %j.019.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + store i32 %add.us, i32* %arrayidx.us, align 4 + %inc.us = add nuw i32 %j.019.us, 1 + %exitcond = icmp eq i32 %inc.us, %N + br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us + +while.cond1.while.end_crit_edge.us: + %inc6.us = add nuw i32 %i.021.us, 1 + %exitcond23 = icmp eq i32 %inc6.us, %N + br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us + +while.end7: + ret void +}