Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15596,6 +15596,12 @@
     auto *Sub = cast<Instruction>(*I->users().begin());
     return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
   };
+  auto IsFMS = [&](Instruction *I) {
+    if (match(I->getOperand(0), m_FNeg(m_Value())) ||
+        match(I->getOperand(1), m_FNeg(m_Value())))
+      return true;
+    return false;
+  };
 
   auto IsSinker = [&](Instruction *I, int Operand) {
     switch (I->getOpcode()) {
@@ -15613,31 +15619,45 @@
     case Instruction::LShr:
     case Instruction::AShr:
       return Operand == 1;
+    case Instruction::Call:
+      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::fma:
+          return !IsFMS(I);
+        default:
+          return false;
+        }
+      }
+      return false;
     default:
       return false;
     }
   };
 
-  int Op = 0;
-  if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
-    Op = 1;
-  if (!IsSinker(I, Op))
-    return false;
-  if (!match(I->getOperand(Op),
-             m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
-                             m_Undef(), m_ZeroMask()))) {
-    return false;
-  }
-  Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
-  // All uses of the shuffle should be sunk to avoid duplicating it across gpr
-  // and vector registers
-  for (Use &U : Shuffle->uses()) {
-    Instruction *Insn = cast<Instruction>(U.getUser());
-    if (!IsSinker(Insn, U.getOperandNo()))
-      return false;
+  for (auto OpIdx : enumerate(I->operands())) {
+    Value *Op = OpIdx.value().get();
+    // Make sure we are not already sinking this operand
+    if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+      continue;
+    // We are looking for a splat that can be sunk.
+    if (!match(Op, m_ShuffleVector(
+                       m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+                       m_Undef(), m_ZeroMask())))
+      continue;
+    if (!IsSinker(I, OpIdx.index()))
+      continue;
+
+    Instruction *Shuffle = cast<Instruction>(Op);
+    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+    // and vector registers
+    for (Use &U : Shuffle->uses()) {
+      Instruction *Insn = cast<Instruction>(U.getUser());
+      if (!IsSinker(Insn, U.getOperandNo()))
+        return false;
+    }
+    Ops.push_back(&Shuffle->getOperandUse(0));
+    Ops.push_back(&OpIdx.value());
   }
-  Ops.push_back(&Shuffle->getOperandUse(0));
-  Ops.push_back(&I->getOperandUse(Op));
   return true;
 }
 
Index: llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1414,8 +1414,10 @@
 define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) {
 ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: .pad #24
@@ -1432,38 +1434,38 @@
 ; CHECK-NEXT: mov r4, sp
 ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2
-; CHECK-NEXT: vldr s4, [r0, #4]
-; CHECK-NEXT: vldrw.u32 q3, [r12]
-; CHECK-NEXT: vldr s8, [r0, #12]
-; CHECK-NEXT: mov r6, lr
-; CHECK-NEXT: vldr s6, [r0, #8]
-; CHECK-NEXT: dls lr, r3
-; CHECK-NEXT: vldr s10, [r0, #16]
-; CHECK-NEXT: vmov.f32 s5, s4
+; CHECK-NEXT: vldr s8, [r0, #4]
+; CHECK-NEXT: vldrw.u32 q4, [r12]
+; CHECK-NEXT: vldr s12, [r0, #12]
+; CHECK-NEXT: mov r7, lr
+; CHECK-NEXT: vldr s10, [r0, #8]
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: vldr s14, [r0, #16]
 ; CHECK-NEXT: vmov.f32 s9, s8
-; CHECK-NEXT: ldr r5, [r0]
-; CHECK-NEXT: vmov.f32 s7, s6
-; CHECK-NEXT: vstrw.32 q3, [r4]
+; CHECK-NEXT: vmov.f32 s13, s12
+; CHECK-NEXT: vldr s4, [r0]
+; CHECK-NEXT: vmov.f32 s15, s14
+; CHECK-NEXT: vstrw.32 q4, [r4]
 ; CHECK-NEXT: vmov.f32 s11, s10
-; CHECK-NEXT: vdup.32 q3, r5
-; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: dls lr, r3
 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vldrw.u32 q5, [r1, q0, uxtw #2]
 ; CHECK-NEXT: vldrw.u32 q6, [r4, q0, uxtw #2]
+; CHECK-NEXT: vmov r6, s4
 ; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vfma.f32 q6, q5, q3
+; CHECK-NEXT: vfma.f32 q6, q5, r6
 ; CHECK-NEXT: vstmia r5, {s24, s25}
 ; CHECK-NEXT: adds r5, #8
 ; CHECK-NEXT: vldrw.u32 q4, [sp, #8]
-; CHECK-NEXT: vfma.f32 q4, q6, q2
-; CHECK-NEXT: vfma.f32 q4, q5, q1
+; CHECK-NEXT: vfma.f32 q4, q6, q3
+; CHECK-NEXT: vfma.f32 q4, q5, q2
 ; CHECK-NEXT: vstrw.32 q4, [r4]
 ; CHECK-NEXT: le lr, .LBB17_3
 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB17_2 Depth=1
-; CHECK-NEXT: mov lr, r6
+; CHECK-NEXT: mov lr, r7
 ; CHECK-NEXT: adds r0, #20
-; CHECK-NEXT: subs.w lr, r6, #1
+; CHECK-NEXT: subs.w lr, r7, #1
 ; CHECK-NEXT: vstrb.8 q4, [r12], #16
 ; CHECK-NEXT: mov r1, r2
 ; CHECK-NEXT: bne .LBB17_2
@@ -1477,7 +1479,8 @@
 ; CHECK-NEXT: .LBB17_7:
 ; CHECK-NEXT: add sp, #24
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
 %5 = alloca [6 x float], align 4
 %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1
 %7 = load float*, float** %6, align 4
Index: llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -10,15 +10,13 @@
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
 ; CHECK-NEXT: dlstp.32 lr, r3
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -127,14 +125,13 @@
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
 ; CHECK-NEXT: dlstp.32 lr, r3
 ; CHECK-NEXT: .LBB2_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vfma.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vfma.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB2_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -242,17 +239,15 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vneg.f32 s0, s0
 ; CHECK-NEXT: dlstp.32 lr, r3
-; CHECK-NEXT: eor r12, r12, #-2147483648
-; CHECK-NEXT: vdup.32 q0, r12
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB4_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB4_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -484,16 +479,15 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vneg.f32 s0, s0
 ; CHECK-NEXT: dlstp.32 lr, r3
-; CHECK-NEXT: eor r12, r12, #-2147483648
-; CHECK-NEXT: vdup.32 q0, r12
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vfma.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vfma.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB8_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -604,15 +598,14 @@
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vdup.32 q0, r12
 ; CHECK-NEXT: dlstp.32 lr, r3
 ; CHECK-NEXT: .LBB10_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vneg.f32 q1, q1
-; CHECK-NEXT: vfma.f32 q1, q2, q0
-; CHECK-NEXT: vstrw.32 q1, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vneg.f32 q0, q0
+; CHECK-NEXT: vfma.f32 q0, q1, r12
+; CHECK-NEXT: vstrw.32 q0, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB10_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
Index: llvm/test/CodeGen/Thumb2/mve-vldst4.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -4,234 +4,232 @@
 define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
 ; CHECK-LABEL: vldst4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #104
-; CHECK-NEXT: sub sp, #104
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
 ; CHECK-NEXT: muls r2, r3, r2
 ; CHECK-NEXT: movs r3, #0
 ; CHECK-NEXT: cmp.w r3, r2, lsr #2
 ; CHECK-NEXT: beq.w .LBB0_3
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
+; CHECK-NEXT: vldr.16 s0, [sp, #160]
 ; CHECK-NEXT: mvn r3, #7
 ; CHECK-NEXT: and.w r2, r3, r2, lsr #2
-; CHECK-NEXT: vldr.16 s0, [sp, #176]
-; CHECK-NEXT: subs r2, #8
 ; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: vmov.f16 r12, s0
+; CHECK-NEXT: subs r2, #8
 ; CHECK-NEXT: add.w lr, r3, r2, lsr #3
-; CHECK-NEXT: vmov.f16 r2, s0
-; CHECK-NEXT: vdup.16 q0, r2
-; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT: .LBB0_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q6, [r0, #32]
-; CHECK-NEXT: vldrh.u16 q4, [r0, #48]
-; CHECK-NEXT: vldrh.u16 q0, [r0], #64
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmovx.f16 s12, s16
-; CHECK-NEXT: vmov.16 q1[4], r3
-; CHECK-NEXT: vmov r2, s26
-; CHECK-NEXT: vmov.16 q1[5], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vldrh.u16 q7, [r0, #-48]
-; CHECK-NEXT: vmov.16 q2[0], r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov.16 q2[1], r3
-; CHECK-NEXT: vmov r2, s28
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r2, s30
-; CHECK-NEXT: vmov.16 q2[3], r2
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmul.f16 q5, q2, q1
-; CHECK-NEXT: vmovx.f16 s4, s2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s8, s28
-; CHECK-NEXT: vmov.16 q1[0], r2
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov.16 q1[1], r3
-; CHECK-NEXT: vmovx.f16 s8, s26
+; CHECK-NEXT: vldrh.u16 q4, [r0, #32]
+; CHECK-NEXT: vldrh.u16 q3, [r0, #48]
+; CHECK-NEXT: vldrh.u16 q7, [r0], #64
+; CHECK-NEXT: vmov r2, s17
+; CHECK-NEXT: vmovx.f16 s8, s13
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov r3, s19
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: vmov r2, s13
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov r2, s15
+; CHECK-NEXT: vmov r3, s29
+; CHECK-NEXT: vldrh.u16 q5, [r0, #-48]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vmov r2, s31
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov r2, s21
 ; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s24
-; CHECK-NEXT: vmov r12, s23
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmov.16 q2[4], r3
-; CHECK-NEXT: vmov r3, s27
-; CHECK-NEXT: vmov.16 q2[5], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vmov.16 q2[6], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s30
-; CHECK-NEXT: vmov.16 q2[7], r2
-; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov r2, s23
 ; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s7, s11
-; CHECK-NEXT: vmov.16 q2[0], r12
-; CHECK-NEXT: vmul.f16 q1, q1, q3
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s4, s23
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q2[4], r2
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov r2, s17
-; CHECK-NEXT: vmov.16 q1[5], r3
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmul.f16 q0, q1, r12
+; CHECK-NEXT: vmovx.f16 s4, s21
+; CHECK-NEXT: vmov q6, q0
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s31
 ; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r2, s29
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov r2, s19
-; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r2, s31
-; CHECK-NEXT: vmov.16 q2[3], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmovx.f16 s4, s3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmovx.f16 s4, s29
-; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: vmovx.f16 s0, s29
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: vmov.16 q0[1], r2
 ; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s27
+; CHECK-NEXT: vmovx.f16 s4, s19
 ; CHECK-NEXT: vmov.16 q0[2], r2
 ; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s25
-; CHECK-NEXT: vmul.f16 q5, q2, q3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmovx.f16 s8, s17
-; CHECK-NEXT: vmov.16 q1[4], r3
-; CHECK-NEXT: vmov r12, s20
+; CHECK-NEXT: vmovx.f16 s4, s17
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vmov.16 q1[4], r4
 ; CHECK-NEXT: vmov.16 q1[5], r2
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s19
+; CHECK-NEXT: vmovx.f16 s8, s15
 ; CHECK-NEXT: vmov.16 q1[6], r2
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmovx.f16 s8, s31
+; CHECK-NEXT: vmovx.f16 s8, s23
 ; CHECK-NEXT: vmov.16 q1[7], r2
 ; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: vmov.16 q0[3], r2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: vmov.f32 s2, s6
 ; CHECK-NEXT: vmov.f32 s3, s7
-; CHECK-NEXT: vmov.16 q1[2], r12
-; CHECK-NEXT: vmul.f16 q4, q0, q3
-; CHECK-NEXT: vmovx.f16 s0, s20
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmovx.f16 s20, s23
+; CHECK-NEXT: vmov.16 q1[2], r3
+; CHECK-NEXT: vmul.f16 q0, q0, r12
+; CHECK-NEXT: vmov r3, s18
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s25
 ; CHECK-NEXT: vmov.16 q1[3], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
+; CHECK-NEXT: vmovx.f16 s0, s9
 ; CHECK-NEXT: vmov.16 q1[6], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s16, s19
+; CHECK-NEXT: vmovx.f16 s8, s12
 ; CHECK-NEXT: vmov.16 q1[7], r2
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: vmov r3, s28
 ; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov r2, s30
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: vmov.16 q1[2], r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vmov r2, s22
+; CHECK-NEXT: vmov.16 q1[3], r2
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmovx.f16 s0, s30
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s28
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: vmul.f16 q6, q1, r12
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: vmovx.f16 s4, s20
+; CHECK-NEXT: vmov.16 q0[1], r2
 ; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov.16 q6[0], r2
+; CHECK-NEXT: vmovx.f16 s4, s18
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: vmovx.f16 s4, s16
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q1[4], r4
+; CHECK-NEXT: vmov r3, s25
+; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmovx.f16 s8, s14
+; CHECK-NEXT: vmov.16 q1[6], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmovx.f16 s8, s22
+; CHECK-NEXT: vmov.16 q1[7], r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.16 q5[0], r3
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmul.f16 q2, q0, r12
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q5[1], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.16 q6[1], r3
-; CHECK-NEXT: vmovx.f16 s0, s8
-; CHECK-NEXT: vmov.16 q6[4], r2
+; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmov.16 q5[4], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.16 q6[5], r2
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: vmov r3, s17
+; CHECK-NEXT: vmovx.f16 s0, s19
+; CHECK-NEXT: vmov.16 q5[5], r2
+; CHECK-NEXT: vmov r2, s19
 ; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vmov r3, s7
 ; CHECK-NEXT: vmov.16 q3[3], r3
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s17
+; CHECK-NEXT: vmovx.f16 s0, s7
 ; CHECK-NEXT: vmov.16 q3[6], r2
 ; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s27
 ; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov r3, s9
+; CHECK-NEXT: vmov r2, s27
 ; CHECK-NEXT: vmov.16 q7[0], r2
-; CHECK-NEXT: vmovx.f16 s0, s5
+; CHECK-NEXT: vmov r3, s11
 ; CHECK-NEXT: vmov.16 q7[1], r3
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmovx.f16 s0, s11
 ; CHECK-NEXT: vmov.16 q7[4], r2
 ; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT: vmov.16 q7[5], r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov r3, s18
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmovx.f16 s0, s16
 ; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmovx.f16 s16, s18
 ; CHECK-NEXT: vmov.16 q3[3], r3
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vmovx.f16 s0, s4
 ; CHECK-NEXT: vmov.16 q3[6], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s4, s6
+; CHECK-NEXT: vmovx.f16 s0, s24
 ; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov.16 q1[1], r3
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vmov.16 q1[4], r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s24, s26
+; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov r2, s26
 ; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vmov.16 q0[0], r2
 ; CHECK-NEXT: vmov.16 q0[1], r3
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmovx.f16 s4, s10
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
 ; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q0[5], r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r3, s19
-; CHECK-NEXT: vmov r2, s20
-; CHECK-NEXT: vmov.16 q1[3], r3
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmovx.f16 s16, s11
-; CHECK-NEXT: vmov.16 q1[7], r2
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: vmov.16 q6[2], r2
+; CHECK-NEXT: vmov r3, s10
 ; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmovx.f16 s16, s10
 ; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q4[5], r2
-; CHECK-NEXT: vmov.f32 s1, s13
-; CHECK-NEXT: vmov.f32 s29, s9
-; CHECK-NEXT: vmov.f32 s31, s11
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov.f32 s25, s21
-; CHECK-NEXT: vmov.f32 s9, s5
-; CHECK-NEXT: vmov.f32 s3, s15
-; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vmov.16 q6[3], r3
+; CHECK-NEXT: vmov.16 q6[6], r2
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmovx.f16 s16, s10
+; CHECK-NEXT: vmov.16 q6[7], r2
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vmov.f32 s1, s25
+; CHECK-NEXT: vmov.f32 s3, s27
 ; CHECK-NEXT: vstrh.16 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s27, s23
-; CHECK-NEXT: vstrh.16 q2, [r1, #48]
-; CHECK-NEXT: vstrh.16 q6, [r1], #64
-; CHECK-NEXT: vstrh.16 q7, [r1, #-48]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vmov.f32 s29, s1
+; CHECK-NEXT: vmov.f32 s31, s3
+; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrh.16 q7, [r1, #48]
+; CHECK-NEXT: vstrh.16 q1, [r1], #64
+; CHECK-NEXT: vmov.f32 s21, s1
+; CHECK-NEXT: vmov.f32 s23, s3
+; CHECK-NEXT: vstrh.16 q5, [r1, #-48]
 ; CHECK-NEXT: le lr, .LBB0_2
 ; CHECK-NEXT: .LBB0_3: @ %while.end
-; CHECK-NEXT: add sp, #104
+; CHECK-NEXT: add sp, #88
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
 entry:
 %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
 %l0 = bitcast i16 %tmp.0.extract.trunc to half