Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1231,7 +1231,7 @@ // for now LSR only handles innermost loops). if (AR->getLoop() != L) { // If the AddRec exists, consider it's register free and leave it alone. - if (isExistingPhi(AR, *SE)) + if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc()) return; // It is bad to allow LSR for current loop to add induction variables @@ -2008,6 +2008,7 @@ void NarrowSearchSpaceByCollapsingUnrolledCode(); void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); void NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); + void NarrowSearchSpaceByFilterPostInc(); void NarrowSearchSpaceByDeletingCostlyFormulas(); void NarrowSearchSpaceByPickingWinnerRegs(); void NarrowSearchSpaceUsingHeuristics(); @@ -4669,6 +4670,54 @@ }); } +/// If we are over the complexity limit, filter out any post-inc preferring +/// variables to only post-inc values. 
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+  if (!TTI.shouldFavorPostInc())
+    return;
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+  // Only reached for post-inc-favoring targets whose search space is too big.
+  LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+                       "Narrowing the search space by choosing the lowest "
+                       "register Formula for PostInc Uses.\n");
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    // Only address uses with a legal post-inc load or store are narrowed.
+    if (LU.Kind != LSRUse::Address)
+      continue;
+    if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+        !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+      continue;
+    // Find the smallest register count among this use's formulae.
+    size_t MinRegs = std::numeric_limits<size_t>::max();
+    for (const Formula &F : LU.Formulae)
+      MinRegs = std::min(F.getNumRegs(), MinRegs);
+    // Drop every formula needing more registers than that minimum.
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+         ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+      if (F.getNumRegs() > MinRegs) {
+        LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+                   dbgs() << "\n");
+        LU.DeleteFormula(F);
+        --FIdx; // Re-examine this index (DeleteFormula presumably refills it).
+        --NumForms; // One fewer formula remains to visit.
+        Any = true;
+      }
+    }
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+    // Stop as soon as the search space is back under the limit.
+    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+      break;
+  }
+
+  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
 /// The function delete formulas with high registers number expectation.
/// Assuming we don't know the value of each formula (already delete /// all inefficient), generate probability of not selecting for each @@ -4879,6 +4928,7 @@ NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); if (FilterSameScaledReg) NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); + NarrowSearchSpaceByFilterPostInc(); if (LSRExpNarrow) NarrowSearchSpaceByDeletingCostlyFormulas(); else Index: llvm/test/CodeGen/Thumb2/mve-float16regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -707,29 +707,27 @@ define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 { ; CHECK-LABEL: test_nested: ; CHECK: @ %bb.0: @ %for.body.us.preheader -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #16] ; CHECK-NEXT: lsl.w r3, r12, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_1: @ %for.body.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 ; CHECK-NEXT: ldrh r4, [r1] -; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: vdup.16 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: .LBB14_2: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: adds r6, r0, r4 -; CHECK-NEXT: adds r7, r2, r4 -; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: subs r5, #8 +; CHECK-NEXT: vldrw.u32 q1, [r5], #16 +; CHECK-NEXT: vldrw.u32 q2, [r4] +; 
CHECK-NEXT: subs r6, #8 ; CHECK-NEXT: vfms.f16 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: vstrb.8 q2, [r4], #16 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 @@ -738,7 +736,7 @@ ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: le lr, .LBB14_1 ; CHECK-NEXT: @ %bb.4: @ %for.end14 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} for.body.us.preheader: %in = load half, half* %ina %cmp = icmp sgt i32 %numRows, 0 @@ -797,59 +795,56 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrh.w r10, [r0] -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: ldrh r5, [r0] ; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w r6, r10, #1 +; CHECK-NEXT: subs r6, r5, #1 ; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: add.w r4, r12, r6, lsl #1 -; CHECK-NEXT: lsr.w lr, r5, #2 -; CHECK-NEXT: ldrh r3, [r7, #6] +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: ldrh.w r8, [r7, #6] ; CHECK-NEXT: ldrh.w r9, [r7, #4] -; CHECK-NEXT: ldrh.w r8, [r7, #2] +; CHECK-NEXT: ldrh r6, [r7, #2] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: strd r5, r10, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: bic r5, r3, #3 +; CHECK-NEXT: add.w r10, r12, #2 +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r5, r2, r5, lsl #1 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r11, r1, r10 -; CHECK-NEXT: add.w r5, r4, r10 +; CHECK-NEXT: vldrw.u32 
q0, [r1], #8 +; CHECK-NEXT: sub.w r11, r10, #2 +; CHECK-NEXT: add.w r5, r10, #2 +; CHECK-NEXT: vstrb.8 q0, [r4], #8 ; CHECK-NEXT: vldrw.u32 q0, [r11] -; CHECK-NEXT: vstrw.32 q0, [r5] -; CHECK-NEXT: add.w r5, r12, r10 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adds r6, r5, #2 -; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vldrw.u32 q1, [r10] ; CHECK-NEXT: vmul.f16 q0, q0, r7 -; CHECK-NEXT: vfma.f16 q0, q1, r8 -; CHECK-NEXT: vldrw.u32 q1, [r5, #4] -; CHECK-NEXT: adds r5, #6 -; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: add.w r5, r2, r10 +; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vldrw.u32 q1, [r10, #4] ; CHECK-NEXT: add.w r10, r10, #8 -; CHECK-NEXT: vfma.f16 q0, q1, r3 -; CHECK-NEXT: vstrw.32 q0, [r5] +; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit -; CHECK-NEXT: add r4, r10 -; CHECK-NEXT: add.w r12, r12, r0, lsl #1 -; CHECK-NEXT: add.w r1, r1, r0, lsl #1 -; CHECK-NEXT: ldm.w sp, {r0, r2, r5, r10} @ 16-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r12, r12, r2, lsl #1 +; CHECK-NEXT: add.w r1, r1, r2, lsl #1 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB15_5: @ %while.end -; CHECK-NEXT: and r6, r5, #3 +; CHECK-NEXT: and lr, r3, #3 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vctp.16 r6 +; CHECK-NEXT: vctp.16 lr ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r4] ; CHECK-NEXT: vldrw.u32 q0, [r12] @@ -857,22 +852,22 @@ ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: add.w r1, r12, #6 ; CHECK-NEXT: vmul.f16 q0, q0, r7 -; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r12, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r9 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vfma.f16 q0, q1, r3 +; 
CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r2] ; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r12, r5, lsl #1 -; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: add.w r0, r12, r3, lsl #1 +; CHECK-NEXT: lsr.w lr, r5, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r10, #3 +; CHECK-NEXT: bic r2, r5, #3 +; CHECK-NEXT: adds r1, r2, r3 ; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: adds r1, r2, r5 ; CHECK-NEXT: add.w r1, r12, r1, lsl #1 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -883,7 +878,7 @@ ; CHECK-NEXT: add.w r12, r12, r2, lsl #1 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: ands r1, r5, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] Index: llvm/test/CodeGen/Thumb2/mve-float32regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -685,29 +685,27 @@ define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 { ; CHECK-LABEL: test_nested: ; CHECK: @ %bb.0: @ %for.body.us.preheader -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #16] ; CHECK-NEXT: lsl.w r3, r12, #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_1: @ %for.body.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 ; CHECK-NEXT: ldr r4, [r1] -; 
CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: .LBB14_2: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: adds r6, r0, r4 -; CHECK-NEXT: adds r7, r2, r4 -; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vldrw.u32 q1, [r5], #16 +; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: vfms.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: vstrb.8 q2, [r4], #16 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 @@ -716,7 +714,7 @@ ; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: le lr, .LBB14_1 ; CHECK-NEXT: @ %bb.4: @ %for.end14 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} for.body.us.preheader: %cmp = icmp sgt i32 %numRows, 0 tail call void @llvm.assume(i1 %cmp) @@ -772,79 +770,72 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrh r5, [r0] -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldrh.w r9, [r0] +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w lr, r5, #1 -; CHECK-NEXT: cmp.w lr, #3 +; CHECK-NEXT: sub.w r1, r9, #1 +; CHECK-NEXT: cmp r1, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r4, [r0, #8] -; CHECK-NEXT: ldr r3, [r4, #12] -; CHECK-NEXT: ldm.w r4, {r7, r8, r9} -; CHECK-NEXT: add.w r4, r12, lr, lsl #2 -; CHECK-NEXT: lsr.w lr, r6, #2 +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: ldrd r7, r6, [r4] +; CHECK-NEXT: ldrd r5, r8, [r4, #8] +; CHECK-NEXT: add.w r4, r12, r1, 
lsl #2 ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: strd r6, r5, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: bic r5, r6, #3 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: add.w r5, r2, r5, lsl #2 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bic r1, r3, #3 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r12, #4 +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r11, r1, r10 -; CHECK-NEXT: add.w r5, r4, r10 -; CHECK-NEXT: vldrw.u32 q0, [r11] -; CHECK-NEXT: add.w r6, r12, r10 -; CHECK-NEXT: vstrw.32 q0, [r5] -; CHECK-NEXT: add.w r5, r2, r10 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vldrw.u32 q1, [r6, #4] -; CHECK-NEXT: vldrw.u32 q2, [r6, #12] -; CHECK-NEXT: add.w r10, r10, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 +; CHECK-NEXT: vldrw.u32 q0, [r10, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r10], #16 ; CHECK-NEXT: vmul.f32 q0, q0, r7 -; CHECK-NEXT: vfma.f32 q0, q1, r8 -; CHECK-NEXT: vldrw.u32 q1, [r6, #8] -; CHECK-NEXT: vfma.f32 q0, q1, r9 -; CHECK-NEXT: vfma.f32 q0, q2, r3 -; CHECK-NEXT: vstrw.32 q0, [r5] +; CHECK-NEXT: vldrw.u32 q2, [r10, #-8] +; CHECK-NEXT: vfma.f32 q0, q1, r6 +; CHECK-NEXT: vldrw.u32 q1, [r10, #-12] +; CHECK-NEXT: vfma.f32 q0, q1, r5 +; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit -; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r4, r10 -; CHECK-NEXT: add.w r12, r12, r0, lsl #2 -; CHECK-NEXT: add.w r1, r1, r0, lsl #2 -; CHECK-NEXT: ldm.w sp, {r0, r2, r6} @ 12-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; 
CHECK-NEXT: add.w r12, r12, r1, lsl #2 +; CHECK-NEXT: add.w r11, r11, r1, lsl #2 ; CHECK-NEXT: .LBB15_5: @ %while.end -; CHECK-NEXT: and lr, r6, #3 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vctp.32 lr +; CHECK-NEXT: and r1, r3, #3 +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r4] ; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vldrw.u32 q1, [r12, #4] ; CHECK-NEXT: vmul.f32 q0, q0, r7 -; CHECK-NEXT: vfma.f32 q0, q1, r8 +; CHECK-NEXT: vfma.f32 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vfma.f32 q0, q1, r9 +; CHECK-NEXT: vfma.f32 q0, q1, r5 ; CHECK-NEXT: vldrw.u32 q1, [r12, #12] -; CHECK-NEXT: vfma.f32 q0, q1, r3 +; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r2] ; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r12, r6, lsl #2 -; CHECK-NEXT: lsr.w lr, r5, #2 +; CHECK-NEXT: add.w r0, r12, r3, lsl #2 +; CHECK-NEXT: lsr.w lr, r9, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r5, #3 +; CHECK-NEXT: bic r2, r9, #3 +; CHECK-NEXT: adds r1, r2, r3 ; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: adds r1, r2, r6 ; CHECK-NEXT: add.w r1, r12, r1, lsl #2 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -855,7 +846,7 @@ ; CHECK-NEXT: add.w r12, r12, r2, lsl #2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r5, #3 +; CHECK-NEXT: ands r1, r9, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] @@ -863,7 +854,7 @@ ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r12] ; CHECK-NEXT: .LBB15_12: @ %if.end61 -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 
@@ -1399,10 +1390,12 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) { ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: ldrb.w lr, [r0] @@ -1417,39 +1410,39 @@ ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 -; CHECK-NEXT: vldr s4, [r0, #4] -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vldr s8, [r0, #12] -; CHECK-NEXT: mov r6, lr -; CHECK-NEXT: vldr s6, [r0, #8] +; CHECK-NEXT: mov r7, lr +; CHECK-NEXT: ldr.w lr, [r0, #12] +; CHECK-NEXT: ldrd r5, r6, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vldr s12, [r0, #8] +; CHECK-NEXT: vdup.32 q2, lr +; CHECK-NEXT: vldr s14, [r0, #16] +; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: vdup.32 q1, r6 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: vldr s10, [r0, #16] -; CHECK-NEXT: vmov.f32 s5, s4 -; CHECK-NEXT: vmov.f32 s9, s8 -; CHECK-NEXT: ldr r5, [r0] -; CHECK-NEXT: vmov.f32 s7, s6 -; CHECK-NEXT: vstrw.32 q3, [r4] -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vdup.32 q3, r5 -; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s7, s12 +; CHECK-NEXT: vmov.f32 s11, s14 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q5, [r1, q0, uxtw #2] -; CHECK-NEXT: vldrw.u32 q6, [r4, q0, uxtw #2] +; 
CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q5, [r4, q0, uxtw #2] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vfma.f32 q6, q5, q3 -; CHECK-NEXT: vstmia r5, {s24, s25} -; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vldrw.u32 q4, [sp, #8] -; CHECK-NEXT: vfma.f32 q4, q6, q2 -; CHECK-NEXT: vfma.f32 q4, q5, q1 -; CHECK-NEXT: vstrw.32 q4, [r4] +; CHECK-NEXT: vfma.f32 q5, q4, r5 +; CHECK-NEXT: vstmia r6, {s20, s21} +; CHECK-NEXT: adds r6, #8 +; CHECK-NEXT: vldrw.u32 q3, [sp, #8] +; CHECK-NEXT: vfma.f32 q3, q5, q2 +; CHECK-NEXT: vfma.f32 q3, q4, q1 +; CHECK-NEXT: vstrw.32 q3, [r4] ; CHECK-NEXT: le lr, .LBB17_3 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB17_2 Depth=1 -; CHECK-NEXT: mov lr, r6 +; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: adds r0, #20 -; CHECK-NEXT: subs.w lr, r6, #1 -; CHECK-NEXT: vstrb.8 q4, [r12], #16 +; CHECK-NEXT: subs.w lr, r7, #1 +; CHECK-NEXT: vstrb.8 q3, [r12], #16 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: bne .LBB17_2 ; CHECK-NEXT: b .LBB17_7 @@ -1462,8 +1455,9 @@ ; CHECK-NEXT: le lr, .LBB17_6 ; CHECK-NEXT: .LBB17_7: ; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %5 = alloca [6 x float], align 4 %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1 %7 = load float*, float** %6, align 4