diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15512,13 +15512,29 @@
   if (!Subtarget->hasMVEIntegerOps())
     return false;
 
-  auto IsSinker = [](Instruction *I, int Operand) {
+  // True when I has a single use and that user is an FSub which takes I as
+  // operand 1, i.e. I is the value being subtracted. An FMul matching this is
+  // not reported as a sinker below. NOTE(review): presumably so it can still
+  // fold into a fused multiply-subtract - confirm against the ISel patterns.
+  auto IsFMSMul = [&](Instruction *I) {
+    if (!I->hasOneUse())
+      return false;
+    auto *Sub = cast<Instruction>(*I->users().begin());
+    return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
+  };
+
+  auto IsSinker = [&](Instruction *I, int Operand) {
     switch (I->getOpcode()) {
     case Instruction::Add:
     case Instruction::Mul:
+    case Instruction::FAdd:
     case Instruction::ICmp:
+    case Instruction::FCmp:
       return true;
+    case Instruction::FMul:
+      return !IsFMSMul(I);
     case Instruction::Sub:
+    case Instruction::FSub:
     case Instruction::Shl:
     case Instruction::LShr:
     case Instruction::AShr:
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -9,13 +9,12 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vadd.f16 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vadd.f16 q0, q0, r1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -57,13 +56,12 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 
-; CHECK-NEXT: vadd.f16 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vadd.f16 q0, q0, r1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -105,13 +103,12 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmul.f16 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vmul.f16 q0, q0, r1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -153,13 +150,12 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmul.f16 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vmul.f16 q0, q0, r1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -201,13 +197,12 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vsub.f16 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vsub.f16 q0, q0, r1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -299,15 +294,13 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; 
CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vfma.f16 q3, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vfmas.f16 q1, q0, r2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -354,15 +347,13 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vfma.f16 q3, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vfmas.f16 q1, q0, r2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -409,14 +400,13 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vfma.f16 q2, q1, q0 -; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: vfma.f16 q1, q0, r2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -463,14 +453,13 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; 
CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vfma.f16 q2, q0, q1 -; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: vfma.f16 q1, q0, r2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -629,15 +618,14 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vneg.f16 q1, q1 -; CHECK-NEXT: vfma.f16 q1, q2, q0 -; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: vneg.f16 q0, q0 +; CHECK-NEXT: vfma.f16 q0, q1, r2 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -684,15 +672,14 @@ ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vldr.16 s0, [r2] ; CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs.w r12, r12, #8 -; CHECK-NEXT: vneg.f16 q1, q1 -; CHECK-NEXT: vfma.f16 q1, q0, q2 -; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: vneg.f16 q0, q0 +; CHECK-NEXT: vfma.f16 q0, q1, r2 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: bne .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ 
-820,105 +807,116 @@ define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) { ; CHECK-LABEL: arm_fir_f32_1_4_mve: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: ldrh.w r10, [r0] -; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w r7, r10, #1 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldr r5, [r0, #4] +; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: add.w r11, r5, r7, lsl #1 ; CHECK-NEXT: lsr.w lr, r3, #2 -; CHECK-NEXT: ldrh r4, [r6, #6] -; CHECK-NEXT: vdup.16 q0, r4 -; CHECK-NEXT: ldrh r4, [r6, #4] -; CHECK-NEXT: vdup.16 q1, r4 -; CHECK-NEXT: ldrh r4, [r6, #2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vdup.16 q2, r4 -; CHECK-NEXT: add.w r4, r12, r7, lsl #1 -; CHECK-NEXT: vdup.16 q3, r6 +; CHECK-NEXT: vldr.16 s0, [r6, #6] +; CHECK-NEXT: vldr.16 s2, [r6, #4] +; CHECK-NEXT: vldr.16 s4, [r6, #2] +; CHECK-NEXT: vldr.16 s6, [r6] ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: bic r9, r3, #3 +; CHECK-NEXT: strd r3, r4, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: vmov.f16 r10, s6 +; CHECK-NEXT: vmov.f16 r12, s4 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: vmov.f16 r4, s2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: vmov.f16 r8, s0 +; CHECK-NEXT: add.w r3, r2, r3, lsl #1 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: add.w r8, r2, r9, lsl #1 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: .LBB15_3: @ %while.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, r1, r6 -; CHECK-NEXT: vldrw.u32 q4, [r7] -; CHECK-NEXT: adds r7, r4, r6 -; CHECK-NEXT: vstrw.32 q4, [r7] -; CHECK-NEXT: add.w r7, r12, r6 -; CHECK-NEXT: vldrw.u32 q4, [r7] +; CHECK-NEXT: add.w r9, r1, r6 +; CHECK-NEXT: add.w r7, r11, r6 +; CHECK-NEXT: vldrw.u32 q2, [r9] +; CHECK-NEXT: vstrw.32 q2, [r7] +; CHECK-NEXT: adds r7, r3, r6 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: adds r5, r7, #2 -; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: vldrw.u32 q3, [r5] ; CHECK-NEXT: adds r5, r7, #6 -; CHECK-NEXT: vmul.f16 q4, q4, q3 -; CHECK-NEXT: vfma.f16 q4, q5, q2 -; CHECK-NEXT: vldrw.u32 q5, [r7, #4] -; CHECK-NEXT: vfma.f16 q4, q5, q1 -; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: vmul.f16 q2, q2, r10 +; CHECK-NEXT: vfma.f16 q2, q3, r12 +; CHECK-NEXT: vldrw.u32 q3, [r7, #4] +; CHECK-NEXT: vfma.f16 q2, q3, r4 +; CHECK-NEXT: vldrw.u32 q3, [r5] ; CHECK-NEXT: adds r5, r2, r6 ; CHECK-NEXT: adds r6, #8 -; CHECK-NEXT: vfma.f16 q4, q5, q0 -; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: vfma.f16 q2, q3, r8 +; CHECK-NEXT: vstrw.32 q2, [r5] ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit -; CHECK-NEXT: add r4, r6 -; CHECK-NEXT: add.w r12, r12, r9, lsl #1 -; CHECK-NEXT: add.w r1, r1, r9, lsl #1 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: add r4, sp, #4 +; CHECK-NEXT: add r11, r6 +; CHECK-NEXT: add.w r5, r3, r2, lsl #1 +; CHECK-NEXT: add.w r1, r1, r2, lsl #1 +; CHECK-NEXT: ldm r4, {r2, r3, r4} @ 12-byte Folded Reload ; CHECK-NEXT: .LBB15_5: @ %while.end ; CHECK-NEXT: and r7, r3, #3 -; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vctp.16 r7 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q4, [r4] -; CHECK-NEXT: vldrw.u32 q4, [r12] -; CHECK-NEXT: add.w r1, r12, #2 -; CHECK-NEXT: vmul.f16 q3, q4, q3 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: add.w r1, r12, #6 -; CHECK-NEXT: vfma.f16 q3, q4, q2 -; CHECK-NEXT: 
vldrw.u32 q2, [r12, #4] -; CHECK-NEXT: vfma.f16 q3, q2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vstrht.16 q2, [r11] +; CHECK-NEXT: vldrw.u32 q2, [r5] +; CHECK-NEXT: vmov.f16 r1, s6 +; CHECK-NEXT: adds r7, r5, #2 +; CHECK-NEXT: vmul.f16 q2, q2, r1 +; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: adds r7, r5, #6 +; CHECK-NEXT: vfma.f16 q2, q1, r1 +; CHECK-NEXT: vldrw.u32 q1, [r5, #4] +; CHECK-NEXT: vmov.f16 r1, s2 +; CHECK-NEXT: vfma.f16 q2, q1, r1 +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: vfma.f16 q2, q0, r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q3, [r2] -; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: vstrht.16 q2, [r2] +; CHECK-NEXT: ldr r5, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r12, r3, lsl #1 -; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: add.w r0, r5, r3, lsl #1 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: lsr.w lr, r4, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r10, #3 -; CHECK-NEXT: adds r1, r2, r3 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: add.w r1, r12, r1, lsl #1 +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: adds r1, r7, r3 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: add.w r1, r2, r1, lsl #1 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 ; CHECK-NEXT: vstrb.8 q0, [r3], #8 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit -; CHECK-NEXT: add.w r12, r12, r2, lsl #1 +; CHECK-NEXT: add.w r2, r2, r7, lsl #1 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: ands r1, r4, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vctp.16 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r12] +; CHECK-NEXT: vstrht.16 q0, [r2] ; 
CHECK-NEXT: .LBB15_12: @ %if.end61 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 %0 = load half*, half** %pState1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -8,13 +8,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.f32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vadd.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -54,13 +53,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.f32 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vadd.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -100,13 +98,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; 
CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -146,13 +143,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.f32 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -192,13 +188,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vsub.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -285,15 +280,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -337,15 +330,13 @@ ; 
CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -389,14 +380,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -440,14 +430,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q2, q0, q1 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -597,15 +586,14 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, 
r12 ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, r12 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -649,15 +637,14 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q0, q2 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, r12 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -783,99 +770,118 @@ define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) { ; CHECK-LABEL: arm_fir_f32_1_4_mve: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: ldrh.w r9, [r0] -; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w r7, r9, #1 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: 
.vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldr.w r11, [r0, #4] +; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r6, [r0, #8] -; CHECK-NEXT: add.w r4, r12, r7, lsl #2 -; CHECK-NEXT: ldrd lr, r8, [r6] -; CHECK-NEXT: ldrd r5, r6, [r6, #8] -; CHECK-NEXT: vdup.32 q3, lr -; CHECK-NEXT: vdup.32 q2, r8 -; CHECK-NEXT: vdup.32 q0, r6 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: add.w r0, r11, r7, lsl #2 ; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: vldr s0, [r6] +; CHECK-NEXT: vldr s2, [r6, #4] +; CHECK-NEXT: vldr s4, [r6, #8] +; CHECK-NEXT: vldr s6, [r6, #12] ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: bic r10, r3, #3 +; CHECK-NEXT: strd r5, r3, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: vmov r8, s0 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: add.w r8, r2, r10, lsl #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r2, r3, lsl #2 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, r1, r6 -; CHECK-NEXT: adds r5, r2, r6 -; CHECK-NEXT: vldrw.u32 q4, [r7] -; CHECK-NEXT: adds r7, r4, r6 -; CHECK-NEXT: vstrw.32 q4, [r7] -; CHECK-NEXT: add.w r7, r12, r6 -; CHECK-NEXT: vldrw.u32 q4, [r7] -; CHECK-NEXT: vldrw.u32 q5, [r7, #4] -; CHECK-NEXT: vldrw.u32 q6, [r7, #12] +; CHECK-NEXT: add.w r10, r1, r6 +; CHECK-NEXT: adds r3, r0, r6 +; CHECK-NEXT: vldrw.u32 q2, [r10] +; CHECK-NEXT: add.w r12, r11, r6 +; CHECK-NEXT: add.w r9, r2, r6 ; CHECK-NEXT: adds r6, #16 -; CHECK-NEXT: vmul.f32 q4, q4, q3 -; CHECK-NEXT: vfma.f32 q4, q5, q2 -; CHECK-NEXT: 
vldrw.u32 q5, [r7, #8] -; CHECK-NEXT: vfma.f32 q4, q5, q1 -; CHECK-NEXT: vfma.f32 q4, q6, q0 -; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: vstrw.32 q2, [r3] +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vldrw.u32 q3, [r12, #4] +; CHECK-NEXT: vldrw.u32 q4, [r12, #12] +; CHECK-NEXT: vmul.f32 q2, q2, r8 +; CHECK-NEXT: vfma.f32 q2, q3, r4 +; CHECK-NEXT: vldrw.u32 q3, [r12, #8] +; CHECK-NEXT: vfma.f32 q2, q3, r7 +; CHECK-NEXT: vfma.f32 q2, q4, r5 +; CHECK-NEXT: vstrw.32 q2, [r9] ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit -; CHECK-NEXT: add r4, r6 -; CHECK-NEXT: add.w r12, r12, r10, lsl #2 -; CHECK-NEXT: add.w r1, r1, r10, lsl #2 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: ldrd r3, r4, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: add.w r11, r11, r2, lsl #2 +; CHECK-NEXT: add.w r1, r1, r2, lsl #2 +; CHECK-NEXT: ldrd r2, r5, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB15_5: @ %while.end -; CHECK-NEXT: and r7, r3, #3 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vctp.32 r7 +; CHECK-NEXT: and r6, r3, #3 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov lr, s4 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vctp.32 r6 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r4] -; CHECK-NEXT: vldrw.u32 q4, [r12] -; CHECK-NEXT: vmul.f32 q3, q4, q3 -; CHECK-NEXT: vldrw.u32 q4, [r12, #4] -; CHECK-NEXT: vfma.f32 q3, q4, q2 -; CHECK-NEXT: vldrw.u32 q2, [r12, #8] -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r12, #12] -; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vstrwt.32 q1, [r0] +; CHECK-NEXT: vmov r9, s2 +; CHECK-NEXT: vldrw.u32 q1, [r11, #4] +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: vmul.f32 q0, q0, r7 +; CHECK-NEXT: vfma.f32 q0, q1, r9 +; CHECK-NEXT: vldrw.u32 q1, [r11, #8] +; CHECK-NEXT: vfma.f32 q0, q1, lr +; CHECK-NEXT: vldrw.u32 q1, [r11, #12] +; CHECK-NEXT: vfma.f32 q0, q1, r12 ; CHECK-NEXT: vpst -; 
CHECK-NEXT: vstrwt.32 q3, [r2] -; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: vstrwt.32 q0, [r2] +; CHECK-NEXT: ldr.w r11, [r5, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r12, r3, lsl #2 -; CHECK-NEXT: lsr.w lr, r9, #2 +; CHECK-NEXT: add.w r0, r11, r3, lsl #2 +; CHECK-NEXT: lsr.w lr, r4, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r9, #3 -; CHECK-NEXT: adds r1, r2, r3 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: adds r1, r7, r3 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: add.w r1, r11, r1, lsl #2 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit -; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: add.w r11, r11, r7, lsl #2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r9, #3 +; CHECK-NEXT: ands r1, r4, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r12] +; CHECK-NEXT: vstrwt.32 q0, [r11] ; CHECK-NEXT: .LBB15_12: @ %if.end61 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 %0 = load float*, float** %pState1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -183,24 
+183,23 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: vneg.f32 s2, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vdup.32 q0, r1 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: eor r1, r1, #-2147483648 -; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vpt.f32 le, q0, q3 -; CHECK-NEXT: vcmpt.f32 le, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpt.f32 ge, q1, r2 +; CHECK-NEXT: vcmpt.f32 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r0], #16 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -247,22 +246,20 @@ ; CHECK-NEXT: mvn r2, #7 ; CHECK-NEXT: add.w r1, r2, r1, lsl #3 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vneg.f16 s0, s0 -; CHECK-NEXT: vdup.16 q1, r1 +; CHECK-NEXT: vmov.f16 r1, s2 ; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q3, [r0] -; CHECK-NEXT: vpt.f16 le, q1, q3 -; CHECK-NEXT: vcmpt.f16 le, q3, q0 +; CHECK-NEXT: vldrh.u16 q1, [r0] +; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q2, [r0], #16 +; CHECK-NEXT: vstrht.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ 
-485,24 +482,23 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: vneg.f32 s2, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vdup.32 q0, r1 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: eor r1, r1, #-2147483648 -; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vpt.f32 le, q0, q3 -; CHECK-NEXT: vcmpt.f32 le, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpt.f32 ge, q1, r2 +; CHECK-NEXT: vcmpt.f32 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r0], #16 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -549,22 +545,20 @@ ; CHECK-NEXT: mvn r2, #7 ; CHECK-NEXT: add.w r1, r2, r1, lsl #3 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vneg.f16 s0, s0 -; CHECK-NEXT: vdup.16 q1, r1 +; CHECK-NEXT: vmov.f16 r1, s2 ; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q3, [r0] -; CHECK-NEXT: vpt.f16 le, q1, q3 -; CHECK-NEXT: vcmpt.f16 le, q3, q0 +; CHECK-NEXT: vldrh.u16 q1, [r0] +; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q2, [r0], #16 +; CHECK-NEXT: vstrht.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop 
{r7, pc}