diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -50,6 +50,10 @@ "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); +static cl::opt<bool> + AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), + cl::desc("Enable the generation of WLS loops")); + extern cl::opt<TailPredication::Mode> EnableTailPredication; extern cl::opt<bool> EnableMaskedGatherScatters; @@ -1690,7 +1694,9 @@ }; // Scan the instructions to see if there's any that we know will turn into a - // call or if this loop is already a low-overhead loop. + // call or if this loop is already a low-overhead loop or will become a tail + // predicated loop. + bool IsTailPredLoop = false; auto ScanLoop = [&](Loop *L) { for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { @@ -1699,6 +1705,13 @@ LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); return false; } + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + IsTailPredLoop |= + II->getIntrinsicID() == Intrinsic::get_active_lane_mask || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; } } return true; @@ -1719,7 +1732,7 @@ LLVMContext &C = L->getHeader()->getContext(); HWLoopInfo.CounterInReg = true; HWLoopInfo.IsNestingLegal = false; - HWLoopInfo.PerformEntryTest = true; + HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; HWLoopInfo.CountType = Type::getInt32Ty(C); HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); return true; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -8,28 +8,27 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB0_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r4, r0, r3, lsl #2 -; CHECK-NEXT: add.w r5, r2, r3, lsl #2 -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: mov.w r12, #1 +; CHECK-NEXT: add.w r5, r0, r3, lsl #2 +; CHECK-NEXT: add.w r4, r2, r3, lsl #2 +; CHECK-NEXT: cmp r5, r2 +; CHECK-NEXT: cset r12, hi +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset lr, hi -; CHECK-NEXT: cmp r5, r0 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: cmp r5, r1 ; CHECK-NEXT: add.w r5, r1, r3, lsl #2 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: ands r4, r5 +; CHECK-NEXT: lsls r4, r4, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r6, r6, lr -; CHECK-NEXT: lslseq.w r6, r6, #31 +; CHECK-NEXT: andeq.w r5, lr, r12 +; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB0_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r6, r3, #1 +; CHECK-NEXT: subs r5, r3, #1 ; CHECK-NEXT: and r7, r3, #3 -; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB0_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -48,11 +47,12 @@ ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w lr, r12, r3, lsr #2 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: add.w lr, r5, r3, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, r1, r3 @@ -224,11 +224,11 @@ ; CHECK-NEXT: cbz r2, .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w 
r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r12, r3, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -4,33 +4,33 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) { ; CHECK-LABEL: arm_var_f32_mve: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: cmp r1, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r4, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r4, r1, r4 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: adds r4, #3 -; CHECK-NEXT: add.w r12, r3, r4, lsr #2 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: dlstp.32 lr, r1 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r4], #16 +; CHECK-NEXT: vldrw.u32 q1, [r12], #16 ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dls lr, r12 -; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: vadd.f32 s0, s3, s3 +; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge r3, #4 +; CHECK-NEXT: subs r3, r1, r3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: adds r3, #3 +; CHECK-NEXT: add.w lr, lr, r3, lsr #2 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdiv.f32 s0, s0, s4 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r12, s0 ; 
CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -38,7 +38,7 @@ ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vsubt.f32 q1, q1, r4 +; CHECK-NEXT: vsubt.f32 q1, q1, r12 ; CHECK-NEXT: vfmat.f32 q0, q1, q1 ; CHECK-NEXT: le lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end @@ -48,7 +48,7 @@ ; CHECK-NEXT: vcvt.f32.u32 s2, s2 ; CHECK-NEXT: vdiv.f32 s0, s0, s2 ; CHECK-NEXT: vstr s0, [r2] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %do.body.i diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -387,28 +387,27 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r6, r3, r12, lsl #2 -; CHECK-NEXT: add.w r4, r1, r12 -; CHECK-NEXT: cmp r6, r1 -; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: add.w r4, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r1, r12 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset lr, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r0 -; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: ands r5, r6 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r4, r6 +; CHECK-NEXT: lsls r4, r4, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r5, r4, lr -; CHECK-NEXT: lslseq.w r5, r5, #31 +; CHECK-NEXT: andeq.w r6, r5, lr +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: sub.w r6, r12, #1 ; 
CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -426,11 +425,12 @@ ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r5, r12, #3 +; CHECK-NEXT: bic r6, r12, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: adds r6, r1, #1 @@ -689,28 +689,27 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r6, r3, r12, lsl #2 -; CHECK-NEXT: add.w r4, r1, r12 -; CHECK-NEXT: cmp r6, r1 -; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: add.w r4, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r1, r12 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset lr, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r0 -; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: ands r5, r6 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r4, r6 +; CHECK-NEXT: lsls r4, r4, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r5, r4, lr -; CHECK-NEXT: lslseq.w r5, r5, #31 +; CHECK-NEXT: andeq.w r6, r5, lr +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: sub.w r6, r12, #1 ; CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -728,11 +727,12 @@ ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b 
.LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r5, r12, #3 +; CHECK-NEXT: bic r6, r12, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: adds r6, r1, #1 @@ -991,28 +991,27 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r6, r3, r12, lsl #2 -; CHECK-NEXT: add.w r4, r1, r12, lsl #2 -; CHECK-NEXT: cmp r6, r1 -; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: add.w r4, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r1, r12, lsl #2 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r12, lsl #2 ; CHECK-NEXT: cset lr, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r0 -; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: ands r5, r6 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r4, r6 +; CHECK-NEXT: lsls r4, r4, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r5, r4, lr -; CHECK-NEXT: lslseq.w r5, r5, #31 +; CHECK-NEXT: andeq.w r6, r5, lr +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: sub.w r6, r12, #1 ; CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -1030,11 +1029,12 @@ ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r5, r12, #3 +; CHECK-NEXT: bic r6, r12, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: add.w 
r4, r3, #8 -; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: add.w r6, r1, #8 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -448,12 +448,12 @@ ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -643,41 +643,41 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: movw r12, #47184 +; CHECK-NEXT: movw r3, #23593 ; CHECK-NEXT: ldrd r2, lr, [r1, #4] -; CHECK-NEXT: movw r1, #23593 ; CHECK-NEXT: movt r12, #1310 -; CHECK-NEXT: movt r1, #49807 -; CHECK-NEXT: mla r1, lr, r1, r12 -; CHECK-NEXT: movw r3, #55051 +; CHECK-NEXT: movt r3, #49807 +; CHECK-NEXT: mla r3, lr, r3, r12 +; CHECK-NEXT: movw r1, #55051 ; CHECK-NEXT: movw r4, #23593 -; CHECK-NEXT: movt r3, #163 +; CHECK-NEXT: movt r1, #163 ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: movt r4, #655 -; CHECK-NEXT: ror.w r12, r1, #4 -; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: ror.w r1, r1, #2 +; CHECK-NEXT: ror.w r12, r3, #4 +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: cset r1, lo +; CHECK-NEXT: ror.w r3, r3, #2 ; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: cmp r1, r4 -; CHECK-NEXT: csel r1, r3, r12, lo +; CHECK-NEXT: cmp r3, r4 +; CHECK-NEXT: csel r3, r1, r12, lo ; CHECK-NEXT: lsls.w r4, lr, #30 -; 
CHECK-NEXT: csel r3, r3, r1, ne +; CHECK-NEXT: csel r1, r1, r3, ne ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movw r3, :lower16:days ; CHECK-NEXT: movs r4, #52 -; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r12, r1, lsr #2 -; CHECK-NEXT: movw r1, :lower16:days -; CHECK-NEXT: movt r1, :upper16:days -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mla r1, r3, r4, r1 +; CHECK-NEXT: movt r3, :upper16:days +; CHECK-NEXT: mla r1, r1, r4, r3 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: adds r0, r2, #3 +; CHECK-NEXT: bic r0, r0, #3 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: add.w lr, r12, r0, lsr #2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -8,13 +8,18 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldr r1, [sp, #8] ; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: wlstp.32 lr, r1, .LBB0_2 -; CHECK-NEXT: .LBB0_1: @ %do.body +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: add.w lr, r1, #3 +; CHECK-NEXT: cmp.w r3, lr, lsr #2 +; CHECK-NEXT: beq .LBB0_3 +; CHECK-NEXT: @ %bb.1: @ %do.body.preheader +; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: .LBB0_2: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vaddva.s32 r12, q0 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: .LBB0_2: @ %if.end +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: .LBB0_3: @ %if.end ; CHECK-NEXT: str.w r12, [r2] ; CHECK-NEXT: pop {r7, pc} entry: @@ -48,52 +53,48 @@ define void @nested(i32* nocapture readonly %x, i32* nocapture 
readnone %y, i32* nocapture %z, i32 %m, i32 %n) { ; CHECK-LABEL: nested: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: cbz r3, .LBB1_7 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: cbz r3, .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r12, [sp, #20] +; CHECK-NEXT: ldr r5, [sp, #24] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: b .LBB1_3 -; CHECK-NEXT: .LBB1_2: @ %if.end -; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: b .LBB1_4 +; CHECK-NEXT: .LBB1_2: @ in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: .LBB1_3: @ %if.end +; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: str.w r4, [r2, r1, lsl #2] ; CHECK-NEXT: adds r1, #1 ; CHECK-NEXT: cmp r1, r3 -; CHECK-NEXT: beq .LBB1_7 -; CHECK-NEXT: .LBB1_3: @ %for.body +; CHECK-NEXT: beq .LBB1_8 +; CHECK-NEXT: .LBB1_4: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB1_5 Depth 2 -; CHECK-NEXT: add.w r5, r12, #3 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: lsr.w lr, r5, #2 -; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: @ Child Loop BB1_6 Depth 2 +; CHECK-NEXT: adds r7, r5, #3 +; CHECK-NEXT: cmp.w r12, r7, lsr #2 ; CHECK-NEXT: beq .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_4: @ %do.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 -; CHECK-NEXT: bic r6, r5, #3 -; CHECK-NEXT: mov r7, r12 +; CHECK-NEXT: @ %bb.5: @ %do.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: bic r6, r7, #3 +; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: add.w r8, r0, r6, lsl #2 ; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: add.w r5, r0, r6, lsl #2 -; CHECK-NEXT: .LBB1_5: @ %do.body -; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1 +; CHECK-NEXT: .LBB1_6: @ %do.body +; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This 
Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r7 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vaddvat.s32 r4, q0 -; CHECK-NEXT: bne .LBB1_5 -; CHECK-NEXT: b .LBB1_6 -; CHECK-NEXT: .LBB1_6: @ %if.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 -; CHECK-NEXT: sub.w r12, r12, r6 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: b .LBB1_2 -; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vaddva.s32 r4, q0 +; CHECK-NEXT: letp lr, .LBB1_6 +; CHECK-NEXT: @ %bb.7: @ %if.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: subs r5, r5, r6 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: b .LBB1_3 +; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %cmp20.not = icmp eq i32 %m, 0 br i1 %cmp20.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -264,10 +264,10 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -323,16 +323,15 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: it lt -; CHECK-NEXT: poplt 
{r4, pc} -; CHECK-NEXT: .LBB5_1: @ %vector.ph -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: blt .LBB5_3 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: vdup.32 q0, r6 ; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vneg.f32 q0, q0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -343,8 +342,8 @@ ; CHECK-NEXT: vfma.f32 q3, q2, q1 ; CHECK-NEXT: vstrw.32 q3, [r2], #16 ; CHECK-NEXT: letp lr, .LBB5_2 -; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -390,14 +389,14 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_2: @ %vector.body @@ -410,7 +409,7 @@ ; CHECK-NEXT: vstrw.32 q3, [r2], #16 ; CHECK-NEXT: letp lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -456,14 +455,14 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, 
lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB7_1: @ %vector.ph -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_2: @ %vector.body @@ -476,7 +475,7 @@ ; CHECK-NEXT: vstrw.32 q3, [r2], #16 ; CHECK-NEXT: letp lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -528,10 +527,10 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -587,14 +586,14 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB9_1: @ %vector.ph -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_2: @ %vector.body @@ -606,7 +605,7 @@ ; CHECK-NEXT: vstrw.32 q2, [r2], #16 ; CHECK-NEXT: letp lr, .LBB9_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: 
pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -574,27 +574,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r10, lr} -; CHECK-NEXT: add.w r7, r0, #15 ; CHECK-NEXT: ldr.w r12, [sp, #32] -; CHECK-NEXT: asrs r6, r7, #31 -; CHECK-NEXT: add.w r7, r7, r6, lsr #28 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: asrs r5, r7, #4 -; CHECK-NEXT: cmp r5, #1 -; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r6, r7, #4 ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: adds r7, r2, r1 ; CHECK-NEXT: add.w r5, r2, r1, lsl #1 -; CHECK-NEXT: add.w r1, r1, r1, lsl #1 -; CHECK-NEXT: dlstp.8 lr, r0 -; CHECK-NEXT: add r1, r2 ; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: adds r7, r2, r1 +; CHECK-NEXT: add.w r1, r1, r1, lsl #1 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: dlstp.8 lr, r0 ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r3], #16 @@ -735,20 +727,20 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: cmp r2, r2 ; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] -; CHECK-NEXT: subs r0, r2, r2 -; CHECK-NEXT: ble .LBB5_3 +; CHECK-NEXT: bge .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 ; CHECK-NEXT: ldr.w r11, [sp, #88] +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: 
dlstp.16 lr, r11 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r9, r11, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 @@ -907,20 +899,20 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: cmp r2, r2 ; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] -; CHECK-NEXT: subs r0, r2, r2 -; CHECK-NEXT: ble .LBB6_6 +; CHECK-NEXT: bge .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 ; CHECK-NEXT: ldr.w r11, [sp, #88] +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r9, r11, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1