diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -389,7 +389,13 @@ return useMachineScheduler(); } -bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; } +bool ARMSubtarget::enableSubRegLiveness() const { + if (EnableSubRegLiveness.getNumOccurrences()) + return EnableSubRegLiveness; + // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs + // and q subregs for qqqqpr regs. + return hasMVEIntegerOps(); +} // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -227,11 +227,9 @@ ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vadd.f32 q0, q0, r0 -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: vldr s0, .LCPI1_0 -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.5: @@ -280,7 +278,7 @@ ; CHECK-LABEL: fast_float_half_mac: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB2_20 @@ -303,13 +301,13 @@ ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vmul.f16 q5, q6, q5 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vcvtt.f32.f16 s27, s21 +; CHECK-NEXT: vcvtt.f32.f16 s23, s21 +; CHECK-NEXT: vcvtb.f32.f16 s22, s21 +; CHECK-NEXT: vcvtt.f32.f16 s21, s20 +; CHECK-NEXT: vcvtb.f32.f16 s20, s20 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vcvtb.f32.f16 s26, s21 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vcvtt.f32.f16 s25, s20 -; CHECK-NEXT: vcvtb.f32.f16 s24, s20 -; CHECK-NEXT: vadd.f32 q5, q3, q6 +; CHECK-NEXT: vadd.f32 q5, q3, q5 ; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: bne .LBB2_3 ; CHECK-NEXT: b .LBB2_19 @@ -349,8 +347,8 @@ ; CHECK-NEXT: bpl .LBB2_8 ; CHECK-NEXT: .LBB2_7: @ %cond.load12 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s24, [r0, #6] -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vldr.16 s22, [r0, #6] +; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: .LBB2_8: @ %else13 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 @@ -391,15 +389,15 @@ ; CHECK-NEXT: bpl .LBB2_5 ; CHECK-NEXT: .LBB2_13: @ %cond.load6 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s24, [r0, #2] -; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vldr.16 s22, [r0, #2] +; CHECK-NEXT: vins.f16 s20, s22 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_6 ; CHECK-NEXT: .LBB2_14: @ %cond.load9 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s24, s21 ; CHECK-NEXT: vldr.16 s21, [r0, #4] -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi .LBB2_7 ; CHECK-NEXT: b .LBB2_8 @@ -410,21 +408,21 @@ ; CHECK-NEXT: bpl .LBB2_10 ; CHECK-NEXT: .LBB2_16: @ %cond.load19 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s28, [r1, #2] -; CHECK-NEXT: vins.f16 s24, s28 +; CHECK-NEXT: vldr.16 s26, [r1, #2] +; CHECK-NEXT: vins.f16 s24, s26 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_11 ; CHECK-NEXT: .LBB2_17: @ %cond.load22 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s28, s25 ; CHECK-NEXT: vldr.16 s25, [r1, #4] -; CHECK-NEXT: vins.f16 s25, s28 +; CHECK-NEXT: vmovx.f16 s26, s0 +; CHECK-NEXT: vins.f16 s25, s26 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bpl.w .LBB2_2 ; CHECK-NEXT: .LBB2_18: @ %cond.load25 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s28, [r1, #6] -; CHECK-NEXT: vins.f16 s25, s28 +; CHECK-NEXT: vldr.16 s26, [r1, #6] +; CHECK-NEXT: vins.f16 s25, s26 ; CHECK-NEXT: b .LBB2_2 ; CHECK-NEXT: .LBB2_19: @ %middle.block ; CHECK-NEXT: vdup.32 q0, r12 @@ -439,9 +437,8 @@ ; CHECK-NEXT: .LBB2_20: ; CHECK-NEXT: vldr s0, .LCPI2_0 ; CHECK-NEXT: .LBB2_21: @ %for.cond.cleanup -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.22: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll @@ -14,8 +14,8 @@ ; CHECK-NEXT: vmvn.i32 q1, #0x1f ; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: subs r3, r1, #1 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vidup.u32 q2, r2, #8 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vadd.i32 q1, q2, r0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -15,10 +15,10 @@ ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit -; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: vadd.f32 s0, s3, s3 -; CHECK-NEXT: vcvt.f32.u32 s4, s4 -; CHECK-NEXT: vdiv.f32 s0, s0, s4 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vadd.f32 s2, s3, s3 +; CHECK-NEXT: vcvt.f32.u32 s0, s0 +; CHECK-NEXT: vdiv.f32 s0, s2, s0 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -986,11 +986,11 @@ ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vmul.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1100,11 +1100,11 @@ ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1214,11 +1214,11 @@ ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vsub.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1333,11 +1333,11 @@ ; CHECK-NEXT: vmov.16 q0[3], r8 ; CHECK-NEXT: vcvt.f16.s16 q0, q0 ; CHECK-NEXT: vmul.f16 q0, q1, q0 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -240,11 +240,11 @@ ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vrintr.f32 s7, s3 -; CHECK-NEXT: vrintr.f32 s6, s2 -; CHECK-NEXT: vrintr.f32 s5, s1 -; CHECK-NEXT: vrintr.f32 s4, s0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vrintr.f32 s3, s3 +; CHECK-NEXT: vrintr.f32 s2, s2 +; CHECK-NEXT: vrintr.f32 s1, s1 +; CHECK-NEXT: vrintr.f32 s0, s0 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: vmulh.s16 q2, q1, q1 +; CHECK-NEXT: vmulh.s16 q1, q1, q1 ; CHECK-NEXT: vmulh.s16 q0, q0, q0 -; CHECK-NEXT: vqadd.s16 q0, q0, q2 +; CHECK-NEXT: vqadd.s16 q0, q0, q1 ; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: vstrh.16 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB0_1 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -14,9 +14,9 @@ ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.u32 hi, q1, q0 -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vldr d1, [sp] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 @@ -156,8 +156,8 @@ ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i16 ne, q0, zr -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vldr d1, [sp, #48] +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 @@ -339,12 +339,12 @@ ; CHECK-NEXT: vmov.8 q3[14], r0 ; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: add r0, sp, #88 ; CHECK-NEXT: vcmp.i8 ne, q3, zr ; CHECK-NEXT: vldr d1, [sp, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpnot +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i8 ne, q2, zr ; CHECK-NEXT: vpsel q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll --- a/llvm/test/CodeGen/Thumb2/mve-be.ll +++ b/llvm/test/CodeGen/Thumb2/mve-be.ll @@ -70,10 +70,10 @@ define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LE-LABEL: add_soft: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -81,9 +81,9 @@ ; ; CHECK-BE-LABEL: add_soft: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0] ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 diff --git a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll --- a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll @@ -12,8 +12,10 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r2, ne @@ -21,10 +23,7 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s4, r0 -; CHECK-NEXT: vldr s5, .LCPI0_0 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -76,8 +75,10 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI4_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r2, ne @@ -85,10 +86,7 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s4, r0 -; CHECK-NEXT: vldr s5, .LCPI4_0 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll --- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll @@ -12,6 +12,7 @@ ; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: mov.w r12, #858993459 ; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: and.w r0, lr, r2, lsr #1 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: and.w r2, r12, r0, lsr #2 @@ -51,7 +52,6 @@ ; CHECK-NEXT: vmov s2, r1 ; CHECK-NEXT: add.w r0, r2, r0, lsr #24 ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-cttz.ll b/llvm/test/CodeGen/Thumb2/mve-cttz.ll --- a/llvm/test/CodeGen/Thumb2/mve-cttz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-cttz.ll @@ -4,8 +4,7 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){ ; CHECK-LABEL: cttz_2i64_0_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -16,7 +15,9 @@ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s2, r1 -; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -27,8 +28,6 @@ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s0, r1 -; CHECK-NEXT: vldr s1, .LCPI0_0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -78,8 +77,7 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){ ; CHECK-LABEL: cttz_2i64_1_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -90,7 +88,9 @@ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s2, r1 -; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI4_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -101,8 +101,6 @@ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s0, r1 -; CHECK-NEXT: vldr s1, .LCPI4_0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -724,11 +724,10 @@ define arm_aapcs_vfpcc <4 x float> @fdiv_f32(<4 x float> %in1, <4 x float> %in2) { ; CHECK-LABEL: fdiv_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdiv.f32 s11, s3, s7 -; CHECK-NEXT: vdiv.f32 s10, s2, s6 -; CHECK-NEXT: vdiv.f32 s9, s1, s5 -; CHECK-NEXT: vdiv.f32 s8, s0, s4 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vdiv.f32 s3, s3, s7 +; CHECK-NEXT: vdiv.f32 s2, s2, s6 +; CHECK-NEXT: vdiv.f32 s1, s1, s5 +; CHECK-NEXT: vdiv.f32 s0, s0, s4 ; CHECK-NEXT: bx lr entry: %out = fdiv <4 x float> %in1, %in2 @@ -774,27 +773,26 @@ define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-LABEL: fdiv_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s14, s9 -; CHECK-NEXT: vdiv.f16 s12, s2, s0 -; CHECK-NEXT: vdiv.f16 s0, s8, s4 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vdiv.f16 s1, s9, s5 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmovx.f16 s14, s10 -; CHECK-NEXT: vdiv.f16 s2, s10, s6 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vmovx.f16 s14, s11 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vdiv.f16 s3, s11, s7 -; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vmovx.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vdiv.f16 s0, s0, s4 +; CHECK-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vdiv.f16 s1, s1, s5 +; CHECK-NEXT: vdiv.f16 s4, s8, s4 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vdiv.f16 s2, s2, s6 +; CHECK-NEXT: vdiv.f16 s4, s8, s4 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vdiv.f16 s3, s3, s7 +; CHECK-NEXT: vdiv.f16 s4, s6, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = fdiv <8 x half> %in1, %in2 @@ -806,8 +804,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q5, q0 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 @@ -816,59 +814,59 @@ ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov s24, r0 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s24, s24 -; CHECK-NEXT: vcvtt.f16.f32 s24, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s25, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s25, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s26, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s26, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s27, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s27, s0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %out = frem <8 x half> %in1, %in2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1422,22 +1422,22 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r12, r6, [r0, #4] -; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vldr.16 s4, .LCPI17_0 +; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: vldr.16 s0, .LCPI17_0 ; CHECK-NEXT: lsr.w r9, r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: b .LBB17_3 ; CHECK-NEXT: .LBB17_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: vstr.16 s8, [r12] -; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vmovx.f16 s5, s4 +; CHECK-NEXT: vstr.16 s4, [r12] ; CHECK-NEXT: .LBB17_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: vstr.16 s9, [r12, #2] +; CHECK-NEXT: vstr.16 s5, [r12, #2] ; CHECK-NEXT: adds r6, #10 ; CHECK-NEXT: subs r0, #1 ; CHECK-NEXT: add.w r12, r12, #4 @@ -1446,15 +1446,15 @@ ; CHECK-NEXT: .LBB17_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_5 Depth 2 -; CHECK-NEXT: vldrh.u16 q3, [r6] +; CHECK-NEXT: vldrh.u16 q2, [r6] ; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vshlc q4, r5, #16 +; CHECK-NEXT: vldrh.u16 q3, [r6, #4] ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vshlc q5, r5, #16 -; CHECK-NEXT: vldrh.u16 q4, [r6, #4] -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vshlc q6, r5, #16 -; CHECK-NEXT: vldrh.u16 q2, [r12] -; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vldrh.u16 q1, [r12] +; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: wls lr, r9, .LBB17_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader @@ -1464,19 +1464,19 @@ ; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r7, [r1], #4 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vfma.f16 q2, q3, r7 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vfma.f16 q1, q2, r7 ; CHECK-NEXT: ldrh r4, [r1, #-2] -; CHECK-NEXT: vmov.u16 r7, q2[0] -; CHECK-NEXT: vfma.f16 q2, q4, r7 -; CHECK-NEXT: vins.f16 s9, s4 -; CHECK-NEXT: vfma.f16 q2, q5, r4 -; CHECK-NEXT: vmov.u16 r4, q2[1] -; CHECK-NEXT: vfma.f16 q2, q6, r4 +; CHECK-NEXT: vmov.u16 r7, q1[0] +; CHECK-NEXT: vfma.f16 q1, q3, r7 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vfma.f16 q1, q4, r4 +; CHECK-NEXT: vmov.u16 r4, q1[1] +; CHECK-NEXT: vfma.f16 q1, q5, r4 ; CHECK-NEXT: strh r4, [r5, #2] -; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s4, s5 ; CHECK-NEXT: strh r7, [r5], #4 -; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov.16 q1[2], r3 ; CHECK-NEXT: le lr, .LBB17_5 ; CHECK-NEXT: .LBB17_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 @@ -1485,15 +1485,15 @@ ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vfma.f16 q2, q3, r1 -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vfma.f16 q2, q4, r1 +; CHECK-NEXT: vfma.f16 q1, q2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vfma.f16 q1, q3, r1 ; CHECK-NEXT: strh r1, [r5] -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vstr.16 s6, [r12] +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vstr.16 s2, [r12] ; CHECK-NEXT: b .LBB17_2 ; CHECK-NEXT: .LBB17_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1416,8 +1416,8 @@ ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 ; CHECK-NEXT: ldrd r5, r7, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldr s8, [r0, #8] ; CHECK-NEXT: ldr r6, [r0, #12] +; CHECK-NEXT: vldr s8, [r0, #8] ; CHECK-NEXT: vstrw.32 q1, [r4] ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: vldr s12, [r0, #16] @@ -1647,8 +1647,8 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: ldrd r12, r10, [r0] ; CHECK-NEXT: @ implicit-def: $s2 ; CHECK-NEXT: and r7, r3, #3 @@ -1656,19 +1656,19 @@ ; CHECK-NEXT: lsrs r0, r3, #2 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s7, s6 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vstr s8, [r10] +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: vstr s0, [r10, #4] +; CHECK-NEXT: vstr s1, [r10] ; CHECK-NEXT: add.w r9, r9, #128 +; CHECK-NEXT: vstr s4, [r10, #4] ; CHECK-NEXT: vstr s14, [r10, #8] ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vstr s7, [r10, #12] @@ -1677,48 +1677,48 @@ ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: vldr s7, [r10, #8] -; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: vldr s8, [r10] -; CHECK-NEXT: vldr s10, [r10, #4] +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: vldr s1, [r10] +; CHECK-NEXT: vldr s3, [r10, #4] +; CHECK-NEXT: vldr s7, [r10, #8] ; CHECK-NEXT: vldr s6, [r10, #12] ; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vldr s8, [r1, #12] -; CHECK-NEXT: vldrw.u32 q0, [r9, #112] -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vldr s10, [r1, #8] ; CHECK-NEXT: vmov r7, s7 +; CHECK-NEXT: vldrw.u32 q2, [r9, #16] ; CHECK-NEXT: vmov r11, s6 +; CHECK-NEXT: vldrw.u32 q1, [r9, #112] +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vldr s1, [r1, #12] +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vldr s3, [r1, #8] +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r9] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov r8, s8 -; CHECK-NEXT: vldrw.u32 q0, [r9, #16] +; CHECK-NEXT: vmov r8, s1 ; CHECK-NEXT: ldr r6, [r1, #4] ; CHECK-NEXT: vldrw.u32 q7, [r9, #32] ; CHECK-NEXT: vmul.f32 q1, q1, r8 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vldrw.u32 q3, [r9, #48] -; CHECK-NEXT: vfma.f32 q1, q0, r0 +; CHECK-NEXT: vfma.f32 q1, q2, r0 ; CHECK-NEXT: ldr r0, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q7, r6 ; CHECK-NEXT: vldrw.u32 q6, [r9, #64] +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vfma.f32 q1, q3, r0 ; CHECK-NEXT: vldrw.u32 q5, [r9, #80] ; CHECK-NEXT: vfma.f32 q1, q6, r4 ; CHECK-NEXT: vldrw.u32 q4, [r9, #96] +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q1, q5, r3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q1, q4, r7 -; CHECK-NEXT: vfma.f32 q1, q0, r11 -; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vfma.f32 q1, q2, r11 ; CHECK-NEXT: vstrb.8 q1, [r5], #16 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end @@ -1728,74 +1728,68 @@ ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vldr s24, [r1] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vldr s0, [r1, #4] -; CHECK-NEXT: vldrw.u32 q3, [r9] -; CHECK-NEXT: vldr s3, [r1, #12] -; CHECK-NEXT: vldrw.u32 q4, [r9, #32] -; CHECK-NEXT: vldr s1, [r1, #8] -; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: vldrw.u32 q2, [r9, #96] -; CHECK-NEXT: vmov r6, s3 +; CHECK-NEXT: vmov lr, s6 +; CHECK-NEXT: vldr s6, [r1, #12] +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r9, #112] +; CHECK-NEXT: vldr s1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q3, [r9] +; CHECK-NEXT: vldr s4, [r1, #4] +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vldrw.u32 q2, [r9, #16] +; CHECK-NEXT: vldr s0, [r1] ; CHECK-NEXT: vmul.f32 q3, q3, r6 ; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vstrw.32 q2, [sp, #24] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #112] +; CHECK-NEXT: vldrw.u32 q4, [r9, #32] +; CHECK-NEXT: vfma.f32 q3, q2, r6 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: vldrw.u32 q5, [r9, #48] -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #80] ; CHECK-NEXT: vldrw.u32 q7, [r9, #64] -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #16] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: cmp r7, #1 -; CHECK-NEXT: vfma.f32 q3, q2, r6 -; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov lr, s6 ; CHECK-NEXT: vfma.f32 q3, q5, r3 +; CHECK-NEXT: vldrw.u32 q6, [r9, #80] +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vfma.f32 q3, q7, r0 -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #24] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vfma.f32 q3, q6, r1 +; CHECK-NEXT: cmp r7, #1 ; CHECK-NEXT: vfma.f32 q3, q2, r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q3, q2, lr ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vmov.f32 s8, s24 -; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s14, s12 -; CHECK-NEXT: b .LBB19_11 +; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstmia r5, {s12, s13} -; CHECK-NEXT: bne .LBB19_12 +; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s0 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s0, s24 ; CHECK-NEXT: vmov.f32 s7, s12 -; CHECK-NEXT: .LBB19_11: @ %if.end69 -; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: b .LBB19_2 -; CHECK-NEXT: .LBB19_12: @ %if.else64 +; CHECK-NEXT: b .LBB19_12 +; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s7, s13 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vstr s14, [r5, #8] -; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: .LBB19_12: @ %if.end69 +; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 +; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -2026,8 +2020,8 @@ ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s4, [r12] ; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r12] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: vstr s6, [r12, #4] @@ -2209,9 +2203,9 @@ define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) { ; CHECK-LABEL: vecAddAcrossF32Mve: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s0, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s2 -; CHECK-NEXT: vadd.f32 s0, s4, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %0 = extractelement <4 x float> %in, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll --- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll @@ -22,25 +22,25 @@ ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -67,25 +67,25 @@ ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -112,25 +112,25 @@ ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmls.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmls.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmls.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmls.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmls.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmls.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmls.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -161,22 +161,22 @@ ; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s10, s5 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s8 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 -; CHECK-MVE-NEXT: vmovx.f16 s10, s6 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s2, s6, s8 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s10, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vmla.f16 s6, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s8 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s2, s10 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -207,33 +207,33 @@ ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmov q3, q0 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s2, s12 ; CHECK-MVE-NEXT: vmov.f32 s8, s3 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vmla.f16 s8, s2, s0 ; CHECK-MVE-NEXT: vmov.f32 s0, s3 -; CHECK-MVE-NEXT: vmovx.f16 s9, s12 -; CHECK-MVE-NEXT: vmla.f16 s8, s9, s10 ; CHECK-MVE-NEXT: vmla.f16 s0, s12, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 ; CHECK-MVE-NEXT: vmov.f32 s1, s3 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vmovx.f16 s10, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s2, s5 +; CHECK-MVE-NEXT: vmovx.f16 s4, s13 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmla.f16 s8, s4, s2 ; CHECK-MVE-NEXT: vmla.f16 s1, s13, s5 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vins.f16 s1, s9 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 +; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s2, s6 +; CHECK-MVE-NEXT: vmovx.f16 s4, s14 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmla.f16 s8, s4, s2 ; CHECK-MVE-NEXT: vmov.f32 s2, s3 -; CHECK-MVE-NEXT: vmovx.f16 s10, s14 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 ; CHECK-MVE-NEXT: vmla.f16 s2, s14, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s9 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vmovx.f16 s10, s15 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vins.f16 s2, s8 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: vmla.f16 s3, s15, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s9 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -364,14 +364,13 @@ ; ; CHECK-MVE-LABEL: vfmas32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: @ kill: def $s8 killed $s8 def $q2 ; CHECK-MVE-NEXT: vmov.f32 s11, s8 -; CHECK-MVE-NEXT: vmla.f32 s11, s3, s7 ; CHECK-MVE-NEXT: vmov.f32 s10, s8 -; CHECK-MVE-NEXT: vmla.f32 s10, s2, s6 ; CHECK-MVE-NEXT: vmov.f32 s9, s8 -; CHECK-MVE-NEXT: vmla.f32 s9, s1, s5 ; CHECK-MVE-NEXT: vmla.f32 s8, s0, s4 +; CHECK-MVE-NEXT: vmla.f32 s11, s3, s7 +; CHECK-MVE-NEXT: vmla.f32 s10, s2, s6 +; CHECK-MVE-NEXT: vmla.f32 s9, s1, s5 ; CHECK-MVE-NEXT: vmov q0, q2 ; CHECK-MVE-NEXT: bx lr entry: @@ -401,8 +400,6 @@ ; ; CHECK-MVE-LABEL: vfma16_v1_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -417,98 +414,96 @@ ; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmla.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmla.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmla.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -534,8 +529,6 @@ ; ; CHECK-MVE-LABEL: vfma16_v2_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -550,98 +543,96 @@ ; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmla.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmla.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmla.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -667,8 +658,6 @@ ; ; CHECK-MVE-LABEL: vfms16_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -683,98 +672,96 @@ ; CHECK-MVE-NEXT: vmls.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmls.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmls.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmls.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmls.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmls.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmls.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: vmls.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmls.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmls.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmls.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmls.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -805,108 +792,107 @@ ; ; CHECK-MVE-LABEL: vfmar16_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvtb.f16.f32 s12, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s4 -; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s10, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s0 -; CHECK-MVE-NEXT: vmov.f32 s14, s10 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s14, s8, s12 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s14, s10, s8 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s14 +; CHECK-MVE-NEXT: vseleq.f16 s10, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s8, s0 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s4, s12 -; CHECK-MVE-NEXT: vseleq.f16 s8, s0, s8 -; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s8, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vins.f16 s0, s10 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmov.f32 s12, s10 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s1 +; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s5, s12 +; CHECK-MVE-NEXT: vmla.f16 s10, s5, s8 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s9, s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vins.f16 s9, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s2 +; CHECK-MVE-NEXT: vmov.f32 s10, s2 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s13, s6, s12 -; CHECK-MVE-NEXT: vseleq.f16 s10, s2, s13 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s10 ; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s10, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vmov.f32 s10, s6 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s7, s12 +; CHECK-MVE-NEXT: vmla.f16 s6, s7, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s11, s3, s13 -; CHECK-MVE-NEXT: vins.f16 s11, s14 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -942,104 +928,103 @@ ; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s10, #0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s12, s8 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 -; CHECK-MVE-NEXT: vmov.f32 s8, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: vmov.f32 s14, s8 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s14, s10 +; CHECK-MVE-NEXT: vmla.f16 s14, s12, s10 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s14, s8 +; CHECK-MVE-NEXT: vseleq.f16 s10, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s8, s12 +; CHECK-MVE-NEXT: vmla.f16 s12, s0, s4 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s8, s0, s8 -; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s8, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vins.f16 s0, s10 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s12, s10, s4 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s12 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s1, s5 +; CHECK-MVE-NEXT: vmla.f16 s10, s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s9, s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vins.f16 s9, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmla.f16 s12, s10, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s12 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmla.f16 s10, s2, s6 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s13, s2, s6 -; CHECK-MVE-NEXT: vseleq.f16 s10, s2, s13 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s10 ; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s10, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s3, s7 +; CHECK-MVE-NEXT: vmla.f16 s8, s3, s7 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s11, s3, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s14 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s8 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -1068,51 +1053,50 @@ ; ; CHECK-MVE-LABEL: vfma32_v1_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1138,51 +1122,50 @@ ; ; CHECK-MVE-LABEL: vfma32_v2_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1208,51 +1191,50 @@ ; ; CHECK-MVE-LABEL: vfms32_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmls.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmls.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmls.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmls.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmls.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmls.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmls.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmls.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1281,8 +1263,10 @@ ; ; CHECK-MVE-LABEL: vfmar32_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r2, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -1291,17 +1275,16 @@ ; CHECK-MVE-NEXT: vmov.f32 s14, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s9, s2 +; CHECK-MVE-NEXT: vmla.f32 s10, s5, s8 +; CHECK-MVE-NEXT: vmov.f32 s5, s2 ; CHECK-MVE-NEXT: cset r2, ne ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 ; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 @@ -1312,20 +1295,18 @@ ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s14, s7, s8 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s14, s7, s8 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s10, s5, s8 -; CHECK-MVE-NEXT: vmla.f32 s12, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s9, s6, s8 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s14 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s8 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s14 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s9 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s10 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s10 +; CHECK-MVE-NEXT: vmla.f32 s12, s4, s8 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s12 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s12 ; CHECK-MVE-NEXT: bx lr entry: %i = insertelement <4 x float> undef, float %src3, i32 0 @@ -1366,15 +1347,15 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 @@ -1388,17 +1369,16 @@ ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: vmla.f32 s14, s3, s7 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s10, s1, s5 -; CHECK-MVE-NEXT: vmla.f32 s12, s0, s4 ; CHECK-MVE-NEXT: vmla.f32 s8, s2, s6 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s14 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s14 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s8 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s8 +; CHECK-MVE-NEXT: vmla.f32 s10, s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s10 +; CHECK-MVE-NEXT: vmla.f32 s12, s0, s4 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s10 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s12 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s12 ; CHECK-MVE-NEXT: bx lr entry: %i = insertelement <4 x float> undef, float %src3, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vsqrt.f32 s7, s3 -; CHECK-NEXT: vsqrt.f32 s6, s2 -; CHECK-NEXT: vsqrt.f32 s5, s1 -; CHECK-NEXT: vsqrt.f32 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vsqrt.f32 s3, s3 +; CHECK-NEXT: vsqrt.f32 s2, s2 +; CHECK-NEXT: vsqrt.f32 s1, s1 +; CHECK-NEXT: vsqrt.f32 s0, s0 ; CHECK-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %src) @@ -19,23 +18,22 @@ define arm_aapcs_vfpcc <8 x half> @sqrt_float16_t(<8 x half> %src) { ; CHECK-LABEL: sqrt_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vsqrt.f16 s8, s0 -; CHECK-NEXT: vsqrt.f16 s0, s4 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s1, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s2, s6 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s3, s7 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vsqrt.f16 s0, s0 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s1, s1 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s2, s2 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s3, s3 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %0 = call fast <8 x half> @llvm.sqrt.v8f16(<8 x half> %src) @@ -101,52 +99,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.cos.v8f16(<8 x half> %src) @@ -212,52 +210,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.sin.v8f16(<8 x half> %src) @@ -323,52 +321,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp.v8f16(<8 x half> %src) @@ -434,52 +432,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp2.v8f16(<8 x half> %src) @@ -545,52 +543,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log.v8f16(<8 x half> %src) @@ -656,52 +654,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log2.v8f16(<8 x half> %src) @@ -767,52 +765,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log10.v8f16(<8 x half> %src) @@ -883,8 +881,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q5, q0 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 @@ -893,59 +891,59 @@ ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov s24, r0 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s24, s24 -; CHECK-NEXT: vcvtt.f16.f32 s24, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s25, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s25, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s26, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s26, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s27, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s27, s0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.pow.v8f16(<8 x half> %src1, <8 x half> %src2) @@ -996,8 +994,8 @@ ; CHECK-NEXT: bfi r5, r1, #31, #1 ; CHECK-NEXT: lsr.w r1, r12, #31 ; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: vmov s3, r5 ; CHECK-NEXT: vmov s2, r4 +; CHECK-NEXT: vmov s3, r5 ; CHECK-NEXT: vmov s1, r0 ; CHECK-NEXT: vmov s0, r3 ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -1013,81 +1011,80 @@ ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmovx.f16 s8, s4 ; CHECK-NEXT: vstr.16 s8, [sp, #24] -; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vstr.16 s4, [sp, #28] -; CHECK-NEXT: vstr.16 s8, [sp, #16] -; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vstr.16 s4, [sp, #16] +; CHECK-NEXT: vmovx.f16 s4, s6 ; CHECK-NEXT: vstr.16 s5, [sp, #20] -; CHECK-NEXT: vstr.16 s8, [sp, #8] -; CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vstr.16 s4, [sp, #8] +; CHECK-NEXT: vmovx.f16 s4, s7 ; CHECK-NEXT: vstr.16 s6, [sp, #12] -; CHECK-NEXT: vstr.16 s8, [sp] +; CHECK-NEXT: vstr.16 s4, [sp] ; CHECK-NEXT: vstr.16 s7, [sp, #4] -; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: ldrb.w r0, [sp, #25] +; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vabs.f16 s0, s0 ; CHECK-NEXT: tst.w r0, #128 +; CHECK-NEXT: vneg.f16 s6, s4 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #29] -; CHECK-NEXT: vseleq.f16 s8, s4, s6 -; CHECK-NEXT: vabs.f16 s4, s0 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vneg.f16 s6, s0 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s6, s4 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s0, s3 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #17] -; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vseleq.f16 s0, s0, s6 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #21] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 -; CHECK-NEXT: vabs.f16 s10, s1 +; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s6, s1 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s12, s10 +; CHECK-NEXT: vneg.f16 s8, s6 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #9] -; CHECK-NEXT: vseleq.f16 s5, s10, s12 +; CHECK-NEXT: vseleq.f16 s1, s6, s8 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #13] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 -; CHECK-NEXT: vabs.f16 s10, s2 +; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s2, s2 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s12, s10 +; CHECK-NEXT: vneg.f16 s6, s2 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #1] -; CHECK-NEXT: vseleq.f16 s6, s10, s12 +; CHECK-NEXT: vseleq.f16 s2, s2, s6 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #5] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 +; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s6, s3 ; CHECK-NEXT: tst.w r0, #128 +; CHECK-NEXT: vneg.f16 s8, s6 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vseleq.f16 s7, s0, s2 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vseleq.f16 s3, s6, s8 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll --- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll @@ -5,23 +5,22 @@ define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fneg_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vneg.f16 s8, s0 -; CHECK-MVE-NEXT: vneg.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vneg.f16 s0, s0 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float16_t: @@ -36,11 +35,10 @@ define arm_aapcs_vfpcc <4 x float> @fneg_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fneg_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vneg.f32 s7, s3 -; CHECK-MVE-NEXT: vneg.f32 s6, s2 -; CHECK-MVE-NEXT: vneg.f32 s5, s1 -; CHECK-MVE-NEXT: vneg.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vneg.f32 s3, s3 +; CHECK-MVE-NEXT: vneg.f32 s2, s2 +; CHECK-MVE-NEXT: vneg.f32 s1, s1 +; CHECK-MVE-NEXT: vneg.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float32_t: @@ -77,23 +75,22 @@ define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fabs_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vabs.f16 s8, s0 -; CHECK-MVE-NEXT: vabs.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vabs.f16 s0, s0 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fabs_float16_t: @@ -108,11 +105,10 @@ define arm_aapcs_vfpcc <4 x float> @fabs_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fabs_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vabs.f32 s7, s3 -; CHECK-MVE-NEXT: vabs.f32 s6, s2 -; CHECK-MVE-NEXT: vabs.f32 s5, s1 -; CHECK-MVE-NEXT: vabs.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vabs.f32 s3, s3 +; CHECK-MVE-NEXT: vabs.f32 s2, s2 +; CHECK-MVE-NEXT: vabs.f32 s1, s1 +; CHECK-MVE-NEXT: vabs.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fabs_float32_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll --- a/llvm/test/CodeGen/Thumb2/mve-frint.ll +++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @fceil_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fceil_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintp.f32 s7, s3 -; CHECK-MVE-NEXT: vrintp.f32 s6, s2 -; CHECK-MVE-NEXT: vrintp.f32 s5, s1 -; CHECK-MVE-NEXT: vrintp.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintp.f32 s3, s3 +; CHECK-MVE-NEXT: vrintp.f32 s2, s2 +; CHECK-MVE-NEXT: vrintp.f32 s1, s1 +; CHECK-MVE-NEXT: vrintp.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fceil_float32_t: @@ -24,23 +23,22 @@ define arm_aapcs_vfpcc <8 x half> @fceil_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fceil_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintp.f16 s8, s0 -; CHECK-MVE-NEXT: vrintp.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintp.f16 s0, s0 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fceil_float16_t: @@ -79,11 +77,10 @@ define arm_aapcs_vfpcc <4 x float> @ftrunc_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: ftrunc_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintz.f32 s7, s3 -; CHECK-MVE-NEXT: vrintz.f32 s6, s2 -; CHECK-MVE-NEXT: vrintz.f32 s5, s1 -; CHECK-MVE-NEXT: vrintz.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintz.f32 s3, s3 +; CHECK-MVE-NEXT: vrintz.f32 s2, s2 +; CHECK-MVE-NEXT: vrintz.f32 s1, s1 +; CHECK-MVE-NEXT: vrintz.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ftrunc_float32_t: @@ -98,23 +95,22 @@ define arm_aapcs_vfpcc <8 x half> @ftrunc_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: ftrunc_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintz.f16 s8, s0 -; CHECK-MVE-NEXT: vrintz.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintz.f16 s0, s0 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ftrunc_float16_t: @@ -153,11 +149,10 @@ define arm_aapcs_vfpcc <4 x float> @frint_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: frint_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintx.f32 s7, s3 -; CHECK-MVE-NEXT: vrintx.f32 s6, s2 -; CHECK-MVE-NEXT: vrintx.f32 s5, s1 -; CHECK-MVE-NEXT: vrintx.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintx.f32 s3, s3 +; CHECK-MVE-NEXT: vrintx.f32 s2, s2 +; CHECK-MVE-NEXT: vrintx.f32 s1, s1 +; CHECK-MVE-NEXT: vrintx.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: frint_float32_t: @@ -172,23 +167,22 @@ define arm_aapcs_vfpcc <8 x half> @frint_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: frint_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintx.f16 s8, s0 -; CHECK-MVE-NEXT: vrintx.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintx.f16 s0, s0 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: frint_float16_t: @@ -227,11 +221,10 @@ define arm_aapcs_vfpcc <4 x float> @fnearbyint_float32_t(<4 x float> %src) { ; CHECK-LABEL: fnearbyint_float32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrintr.f32 s7, s3 -; CHECK-NEXT: vrintr.f32 s6, s2 -; CHECK-NEXT: vrintr.f32 s5, s1 -; CHECK-NEXT: vrintr.f32 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vrintr.f32 s3, s3 +; CHECK-NEXT: vrintr.f32 s2, s2 +; CHECK-NEXT: vrintr.f32 s1, s1 +; CHECK-NEXT: vrintr.f32 s0, s0 ; CHECK-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %src) @@ -241,23 +234,22 @@ define arm_aapcs_vfpcc <8 x half> @fnearbyint_float16_t(<8 x half> %src) { ; CHECK-LABEL: fnearbyint_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vrintr.f16 s8, s0 -; CHECK-NEXT: vrintr.f16 s0, s4 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s1, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s2, s6 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s3, s7 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vrintr.f16 s0, s0 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s1, s1 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s2, s2 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s3, s3 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %0 = call fast <8 x half> @llvm.nearbyint.v8f16(<8 x half> %src) @@ -291,11 +283,10 @@ define arm_aapcs_vfpcc <4 x float> @ffloor_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: ffloor_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintm.f32 s7, s3 -; CHECK-MVE-NEXT: vrintm.f32 s6, s2 -; CHECK-MVE-NEXT: vrintm.f32 s5, s1 -; CHECK-MVE-NEXT: vrintm.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintm.f32 s3, s3 +; CHECK-MVE-NEXT: vrintm.f32 s2, s2 +; CHECK-MVE-NEXT: vrintm.f32 s1, s1 +; CHECK-MVE-NEXT: vrintm.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ffloor_float32_t: @@ -310,23 +301,22 @@ define arm_aapcs_vfpcc <8 x half> @ffloor_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: ffloor_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintm.f16 s8, s0 -; CHECK-MVE-NEXT: vrintm.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintm.f16 s0, s0 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ffloor_float16_t: @@ -365,11 +355,10 @@ define arm_aapcs_vfpcc <4 x float> @fround_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fround_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrinta.f32 s7, s3 -; CHECK-MVE-NEXT: vrinta.f32 s6, s2 -; CHECK-MVE-NEXT: vrinta.f32 s5, s1 -; CHECK-MVE-NEXT: vrinta.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrinta.f32 s3, s3 +; CHECK-MVE-NEXT: vrinta.f32 s2, s2 +; CHECK-MVE-NEXT: vrinta.f32 s1, s1 +; CHECK-MVE-NEXT: vrinta.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fround_float32_t: @@ -384,23 +373,22 @@ define arm_aapcs_vfpcc <8 x half> @fround_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fround_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrinta.f16 s8, s0 -; CHECK-MVE-NEXT: vrinta.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrinta.f16 s0, s0 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fround_float16_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -89,23 +89,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vldr.16 s8, [r3] -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vldr.16 s4, [r3] -; CHECK-NEXT: vldr.16 s1, [r2] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldr.16 s2, [r3] +; CHECK-NEXT: vldr.16 s1, [r2] ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s4 ; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] ; CHECK-NEXT: vins.f16 s3, s4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -172,10 +172,10 @@ ; CHECK-NEXT: vldr s3, [r2] ; CHECK-NEXT: vldr s2, [r12] ; CHECK-NEXT: vldr s1, [r1] +; CHECK-NEXT: vldr s0, [lr] ; CHECK-NEXT: vldr s7, [r3] ; CHECK-NEXT: vldr s6, [r0] ; CHECK-NEXT: vldr s5, [r5] -; CHECK-NEXT: vldr s0, [lr] ; CHECK-NEXT: vldr s4, [r4] ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -413,23 +413,23 @@ define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) { ; CHECK-LABEL: ptr_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vldr.16 s1, [r1] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldr.16 s1, [r1] +; CHECK-NEXT: vldr.16 s2, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -441,15 +441,15 @@ define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) { ; CHECK-LABEL: ptr_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] -; CHECK-NEXT: vldr.16 s0, [r0] -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vldr.16 s4, [r1] +; CHECK-NEXT: vldr.16 s0, [r0] +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vldr.16 s2, [r1] ; CHECK-NEXT: vldr.16 s1, [r0] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: bx lr entry: %offs = load <4 x half*>, <4 x half*>* %offptr, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll @@ -81,8 +81,6 @@ define arm_aapcs_vfpcc void @test_vst2q_u32(i32* %addr, %struct.uint32x4x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0] ; CHECK-NEXT: vst21.32 {q0, q1}, [r0] ; CHECK-NEXT: bx lr @@ -97,8 +95,6 @@ define arm_aapcs_vfpcc i32* @test_vst2q_u32_post(i32* %addr, %struct.uint32x4x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_u32_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0] ; CHECK-NEXT: vst21.32 {q0, q1}, [r0]! ; CHECK-NEXT: bx lr @@ -116,8 +112,6 @@ define arm_aapcs_vfpcc void @test_vst2q_f16(half* %addr, %struct.float16x8x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.16 {q0, q1}, [r0] ; CHECK-NEXT: vst21.16 {q0, q1}, [r0] ; CHECK-NEXT: bx lr @@ -132,8 +126,6 @@ define arm_aapcs_vfpcc half* @test_vst2q_f16_post(half* %addr, %struct.float16x8x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_f16_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.16 {q0, q1}, [r0] ; CHECK-NEXT: vst21.16 {q0, q1}, [r0]! ; CHECK-NEXT: bx lr @@ -151,10 +143,6 @@ define arm_aapcs_vfpcc void @test_vst4q_s8(i8* %addr, %struct.int8x16x4_t %value.coerce) { ; CHECK-LABEL: test_vst4q_s8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r0] @@ -175,10 +163,6 @@ define arm_aapcs_vfpcc i8* @test_vst4q_s8_post(i8* %addr, %struct.int8x16x4_t %value.coerce) { ; CHECK-LABEL: test_vst4q_s8_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -6,55 +6,55 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q5, [r2] ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vand q3, q2, q0 ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r4, r1, d4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 ; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r0, r12, d5 -; CHECK-NEXT: vmov.f32 s8, s20 -; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov r4, r1, d6 +; CHECK-NEXT: vmov r0, r12, d7 +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f32 s6, s13 ; CHECK-NEXT: adds r2, r5, r4 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: asr.w r6, r5, #31 ; CHECK-NEXT: adcs r1, r6 ; CHECK-NEXT: asrl r2, r1, r4 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: adds r6, r1, r3 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: asr.w r4, r1, #31 ; CHECK-NEXT: adc.w r1, r4, lr ; CHECK-NEXT: asrl r6, r1, r3 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 ; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: asr.w r3, r1, #31 ; CHECK-NEXT: adc.w r1, r3, r12 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: asr.w r2, r1, #31 ; CHECK-NEXT: adc.w r1, r2, r4 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrl r6, r1, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 @@ -142,30 +142,30 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q5, [r2] -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vand q3, q2, q0 ; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r5, r1, d2 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov r4, lr, d4 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: vmov.f32 s4, s20 -; CHECK-NEXT: vmov.f32 s6, s21 +; CHECK-NEXT: vmov r4, lr, d2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r5, r1, d6 +; CHECK-NEXT: vmov r0, r12, d7 +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov.f32 s4, s12 +; CHECK-NEXT: vmov.f32 s2, s13 ; CHECK-NEXT: adds r2, r6, r5 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r5, s8 ; CHECK-NEXT: asr.w r7, r6, #31 ; CHECK-NEXT: adcs r1, r7 ; CHECK-NEXT: asrl r2, r1, r5 @@ -175,23 +175,23 @@ ; CHECK-NEXT: asr.w r5, r1, #31 ; CHECK-NEXT: adc.w r1, r5, lr ; CHECK-NEXT: asrl r4, r1, r7 -; CHECK-NEXT: vmov r6, r5, d5 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 +; CHECK-NEXT: vmov r6, r5, d3 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 ; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: asr.w r7, r1, #31 ; CHECK-NEXT: adc.w r1, r7, r12 ; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: asrl r0, r1, r7 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r6, r6, r1 ; CHECK-NEXT: asr.w r2, r1, #31 ; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r0 -; CHECK-NEXT: vstrw.32 q2, [r3] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 +; CHECK-NEXT: vstrw.32 q1, [r3] +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: @@ -280,9 +280,9 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: adds.w r12, r2, r2 ; CHECK-NEXT: asr.w r3, r2, #31 ; CHECK-NEXT: adc.w r7, r3, r2, asr #31 @@ -370,26 +370,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: smull r12, r3, r1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q2, q1, q0 ; CHECK-NEXT: asrl r12, r3, r2 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmullb.s32 q1, q0, q2 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov r4, r7, d3 +; CHECK-NEXT: vmov r6, r1, d4 +; CHECK-NEXT: vmov r4, r7, d5 ; CHECK-NEXT: asrl r6, r1, r2 ; CHECK-NEXT: asrl r4, r7, r2 ; CHECK-NEXT: smull r0, r5, r5, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -65,20 +65,20 @@ ; CHECK-LABEL: ext_add_trunc_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: add.w r12, r1, r0 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add r1, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: add r0, r3 @@ -184,17 +184,17 @@ ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov r12, lr, d7 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: adds r0, r0, r4 ; CHECK-NEXT: asr.w r5, r4, #31 ; CHECK-NEXT: adcs r1, r5 @@ -205,9 +205,9 @@ ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: lsrl r2, r3, #1 ; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: adds.w r4, r3, r12 ; CHECK-NEXT: asr.w r6, r3, #31 ; CHECK-NEXT: adc.w r3, r6, lr @@ -216,8 +216,7 @@ ; CHECK-NEXT: adc.w r1, r2, r5 ; CHECK-NEXT: lsrl r4, r3, #1 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r4 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> @@ -346,11 +345,11 @@ ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vand q1, q1, q3 ; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov r12, r2, d5 ; CHECK-NEXT: vmov r8, r9, d3 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov lr, s2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: adds.w r4, r1, r12 @@ -359,21 +358,21 @@ ; CHECK-NEXT: asrl r4, r5, r12 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbc.w r2, r5, r2 -; CHECK-NEXT: asr.w r5, lr, #31 ; CHECK-NEXT: umull r0, r4, r0, r12 ; CHECK-NEXT: adds.w r6, lr, r8 +; CHECK-NEXT: mla r3, r2, r12, r4 +; CHECK-NEXT: asr.w r5, lr, #31 ; CHECK-NEXT: adc.w r5, r5, r9 +; CHECK-NEXT: rsbs r2, r1, #0 ; CHECK-NEXT: asrl r6, r5, r8 -; CHECK-NEXT: mla r3, r2, r12, r4 +; CHECK-NEXT: lsll r0, r3, r2 ; CHECK-NEXT: subs.w r7, r6, r8 +; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: sbc.w r10, r5, r9 -; CHECK-NEXT: rsbs r2, r1, #0 ; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: lsll r0, r3, r2 -; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: lsll r0, r3, r12 -; CHECK-NEXT: asrs r3, r5, #31 ; CHECK-NEXT: adds r4, r5, r6 +; CHECK-NEXT: asr.w r3, r5, #31 ; CHECK-NEXT: adcs r3, r2 ; CHECK-NEXT: asrl r4, r3, r6 ; CHECK-NEXT: subs r4, r4, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -851,15 +851,15 @@ ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB18_5 ; CHECK-LE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB18_5: @ %else8 ; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-LE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-LE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-LE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-LE-NEXT: and r3, r2, #1 ; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 @@ -874,19 +874,19 @@ ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, s4 +; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s5 +; CHECK-LE-NEXT: vmovmi r2, s1 ; CHECK-LE-NEXT: strmi r2, [r0, #4] ; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s6 +; CHECK-LE-NEXT: vmovmi r2, s2 ; CHECK-LE-NEXT: strmi r2, [r0, #8] ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s7 +; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} @@ -895,14 +895,14 @@ ; CHECK-LE-NEXT: lsls r3, r1, #30 ; CHECK-LE-NEXT: bpl .LBB18_2 ; CHECK-LE-NEXT: .LBB18_7: @ %cond.load1 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-LE-NEXT: vins.f16 s0, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-LE-NEXT: vins.f16 s0, s2 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB18_3 ; CHECK-LE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vmovx.f16 s2, s0 +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bmi .LBB18_4 ; CHECK-LE-NEXT: b .LBB18_5 @@ -942,15 +942,15 @@ ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB18_5 ; CHECK-BE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB18_5: @ %else8 ; CHECK-BE-NEXT: vmrs r2, p0 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-BE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-BE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-BE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: bfi r1, r3, #0, #1 @@ -965,19 +965,19 @@ ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s4 +; CHECK-BE-NEXT: vmovmi r2, s0 ; CHECK-BE-NEXT: strmi r2, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: vmovmi r2, s1 ; CHECK-BE-NEXT: strmi r2, [r0, #4] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: vmovmi r2, s2 ; CHECK-BE-NEXT: strmi r2, [r0, #8] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne r1, s7 +; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} @@ -986,14 +986,14 @@ ; CHECK-BE-NEXT: lsls r3, r1, #29 ; CHECK-BE-NEXT: bpl .LBB18_2 ; CHECK-BE-NEXT: .LBB18_7: @ %cond.load1 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-BE-NEXT: vins.f16 s0, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-BE-NEXT: vins.f16 s0, s2 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB18_3 ; CHECK-BE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vmovx.f16 s2, s0 +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB18_4 ; CHECK-BE-NEXT: b .LBB18_5 @@ -1042,15 +1042,15 @@ ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB19_5 ; CHECK-LE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB19_5: @ %else8 ; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-LE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-LE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-LE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-LE-NEXT: and r3, r2, #1 ; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 @@ -1065,19 +1065,19 @@ ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, s4 +; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s5 +; CHECK-LE-NEXT: vmovmi r2, s1 ; CHECK-LE-NEXT: strmi r2, [r0, #4] ; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s6 +; CHECK-LE-NEXT: vmovmi r2, s2 ; CHECK-LE-NEXT: strmi r2, [r0, #8] ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s7 +; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} @@ -1086,14 +1086,14 @@ ; CHECK-LE-NEXT: lsls r3, r1, #30 ; CHECK-LE-NEXT: bpl .LBB19_2 ; CHECK-LE-NEXT: .LBB19_7: @ %cond.load1 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-LE-NEXT: vins.f16 s0, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-LE-NEXT: vins.f16 s0, s2 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB19_3 ; CHECK-LE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vmovx.f16 s2, s0 +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bmi .LBB19_4 ; CHECK-LE-NEXT: b .LBB19_5 @@ -1133,15 +1133,15 @@ ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB19_5 ; CHECK-BE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB19_5: @ %else8 ; CHECK-BE-NEXT: vmrs r2, p0 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-BE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-BE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-BE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: bfi r1, r3, #0, #1 @@ -1156,19 +1156,19 @@ ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s4 +; CHECK-BE-NEXT: vmovmi r2, s0 ; CHECK-BE-NEXT: strmi r2, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: vmovmi r2, s1 ; CHECK-BE-NEXT: strmi r2, [r0, #4] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: vmovmi r2, s2 ; CHECK-BE-NEXT: strmi r2, [r0, #8] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne r1, s7 +; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} @@ -1177,14 +1177,14 @@ ; CHECK-BE-NEXT: lsls r3, r1, #29 ; CHECK-BE-NEXT: bpl .LBB19_2 ; CHECK-BE-NEXT: .LBB19_7: @ %cond.load1 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-BE-NEXT: vins.f16 s0, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-BE-NEXT: vins.f16 s0, s2 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB19_3 ; CHECK-BE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vmovx.f16 s2, s0 +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB19_4 ; CHECK-BE-NEXT: b .LBB19_5 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -108,8 +108,8 @@ ; CHECK-LE-LABEL: masked_v4i32_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -117,8 +117,8 @@ ; CHECK-BE-LABEL: masked_v4i32_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! @@ -137,8 +137,8 @@ ; CHECK-LE-LABEL: masked_v4i32_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -146,8 +146,8 @@ ; CHECK-BE-LABEL: masked_v4i32_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 @@ -327,8 +327,8 @@ ; CHECK-LE-LABEL: masked_v8i16_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -336,8 +336,8 @@ ; CHECK-BE-LABEL: masked_v8i16_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! @@ -356,8 +356,8 @@ ; CHECK-LE-LABEL: masked_v8i16_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -365,8 +365,8 @@ ; CHECK-BE-LABEL: masked_v8i16_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 @@ -405,8 +405,8 @@ ; CHECK-LE-LABEL: masked_v16i8_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr ; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -414,8 +414,8 @@ ; CHECK-BE-LABEL: masked_v16i8_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr ; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]! @@ -434,8 +434,8 @@ ; CHECK-LE-LABEL: masked_v16i8_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr ; CHECK-LE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -443,8 +443,8 @@ ; CHECK-BE-LABEL: masked_v16i8_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr ; CHECK-BE-NEXT: vstrbt.8 q1, [r0], #4 @@ -568,8 +568,8 @@ ; CHECK-LE-LABEL: masked_v4f32_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -577,8 +577,8 @@ ; CHECK-BE-LABEL: masked_v4f32_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! @@ -597,8 +597,8 @@ ; CHECK-LE-LABEL: masked_v4f32_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -606,8 +606,8 @@ ; CHECK-BE-LABEL: masked_v4f32_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 @@ -709,8 +709,8 @@ ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: bpl .LBB16_2 ; CHECK-LE-NEXT: .LBB16_10: @ %cond.store1 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #24] +; CHECK-LE-NEXT: vmovx.f16 s0, s0 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #24] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #24] ; CHECK-LE-NEXT: strh r2, [r0, #2] ; CHECK-LE-NEXT: lsls r2, r1, #29 @@ -722,8 +722,8 @@ ; CHECK-LE-NEXT: lsls r2, r1, #28 ; CHECK-LE-NEXT: bpl .LBB16_4 ; CHECK-LE-NEXT: .LBB16_12: @ %cond.store5 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #16] +; CHECK-LE-NEXT: vmovx.f16 s0, s1 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #16] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #16] ; CHECK-LE-NEXT: strh r2, [r0, #6] ; CHECK-LE-NEXT: lsls r2, r1, #27 @@ -735,8 +735,8 @@ ; CHECK-LE-NEXT: lsls r2, r1, #26 ; CHECK-LE-NEXT: bpl .LBB16_6 ; CHECK-LE-NEXT: .LBB16_14: @ %cond.store9 -; CHECK-LE-NEXT: vmovx.f16 s4, s2 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-LE-NEXT: vmovx.f16 s0, s2 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #8] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #8] ; CHECK-LE-NEXT: strh r2, [r0, #10] ; CHECK-LE-NEXT: lsls r2, r1, #25 @@ -877,8 +877,8 @@ ; CHECK-LE-LABEL: masked_v8f16_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -886,8 +886,8 @@ ; CHECK-BE-LABEL: masked_v8f16_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! @@ -906,8 +906,8 @@ ; CHECK-LE-LABEL: masked_v8f16_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -915,8 +915,8 @@ ; CHECK-BE-LABEL: masked_v8f16_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 @@ -1253,12 +1253,12 @@ ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB25_5 @@ -1328,12 +1328,12 @@ ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB25_5 @@ -1354,8 +1354,8 @@ ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB25_2 ; CHECK-BE-NEXT: .LBB25_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB25_3 ; CHECK-BE-NEXT: .LBB25_7: @ %cond.store3 @@ -1409,12 +1409,12 @@ ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB26_5 @@ -1484,12 +1484,12 @@ ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB26_5 @@ -1510,8 +1510,8 @@ ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB26_2 ; CHECK-BE-NEXT: .LBB26_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB26_3 ; CHECK-BE-NEXT: .LBB26_7: @ %cond.store3 @@ -1565,12 +1565,12 @@ ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB27_5 @@ -1648,12 +1648,12 @@ ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB27_5 @@ -1676,8 +1676,8 @@ ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB27_2 ; CHECK-BE-NEXT: .LBB27_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #8] ; CHECK-BE-NEXT: ldrh.w r2, [sp, #8] ; CHECK-BE-NEXT: strh r2, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -279,11 +279,10 @@ define arm_aapcs_vfpcc <4 x float> @maxnm_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: maxnm_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmaxnm.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vmaxnm.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vmaxnm.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vmaxnm.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vmaxnm.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vmaxnm.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: maxnm_float32_t: @@ -299,27 +298,26 @@ define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: minnm_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vminnm.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vminnm.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: minnm_float16_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll --- a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll @@ -104,20 +104,20 @@ ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: .save {r4, r5, r7, lr} ; CHECK-NOFP-NEXT: push {r4, r5, r7, lr} -; CHECK-NOFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: vmov q5, q1 -; CHECK-NOFP-NEXT: vmov q6, q0 -; CHECK-NOFP-NEXT: vmov r4, r0, d13 -; CHECK-NOFP-NEXT: vmov r5, r1, d11 +; CHECK-NOFP-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NOFP-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NOFP-NEXT: vmov q4, q1 +; CHECK-NOFP-NEXT: vmov q5, q0 +; CHECK-NOFP-NEXT: vmov r4, r0, d11 +; CHECK-NOFP-NEXT: vmov r5, r1, d9 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s19, r0 ; CHECK-NOFP-NEXT: mov r0, r4 ; CHECK-NOFP-NEXT: mov r1, r5 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s18, r0 -; CHECK-NOFP-NEXT: vmov r4, r0, d12 -; CHECK-NOFP-NEXT: vmov r5, r1, d10 +; CHECK-NOFP-NEXT: vmov r4, r0, d10 +; CHECK-NOFP-NEXT: vmov r5, r1, d8 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s17, r0 ; CHECK-NOFP-NEXT: mov r0, r4 @@ -125,7 +125,7 @@ ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s16, r0 ; CHECK-NOFP-NEXT: vmov q0, q4 -; CHECK-NOFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NOFP-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NOFP-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -152,40 +152,39 @@ ; CHECK-NEXT: .pad #408 ; CHECK-NEXT: sub sp, #408 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals -; CHECK-NEXT: vldr s12, .LCPI1_0 -; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals ; CHECK-NEXT: vldr s15, .LCPI1_1 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: ldr r0, [r3, #4]! +; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals ; CHECK-NEXT: movw r2, :lower16:e +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: ldr r6, [r4, #8]! -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov s13, r3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: ldr r0, [r3, #4]! ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: vmov r5, s15 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov s21, r2 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov s13, r3 +; CHECK-NEXT: vldr s12, .LCPI1_0 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 -; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: vstrw.32 q0, [sp, #92] ; CHECK-NEXT: vmov q0, q7 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov q4, q7 ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q7[1], r2 -; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmov s21, r2 ; CHECK-NEXT: movs r1, #64 +; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: str r0, [sp, #40] -; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: str r6, [r0] -; CHECK-NEXT: vstrw.32 q7, [r0] +; CHECK-NEXT: vmov.f32 s23, s15 ; CHECK-NEXT: str r0, [r0] +; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: vstrw.32 q7, [r0] ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q6, [r0] ; CHECK-NEXT: mov.w r8, #0 @@ -193,6 +192,7 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r3, r3 ; CHECK-NEXT: mov.w r12, #4 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 ; CHECK-NEXT: vmov.32 q4[0], r8 ; CHECK-NEXT: @ implicit-def: $r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -34,13 +34,13 @@ ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: vadd.f32 s4, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add.w r7, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: add r3, r9 ; CHECK-NEXT: cmp r0, r12 -; CHECK-NEXT: vadd.f32 s0, s0, s4 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup @@ -138,15 +138,15 @@ ; CHECK-NEXT: letp lr, .LBB1_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: vadd.f32 s8, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add.w r0, r2, r9, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: add r11, r10 -; CHECK-NEXT: vadd.f32 s2, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: add r6, r10 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s0, s0, s8 -; CHECK-NEXT: vadd.f32 s2, s4, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s2, s4, s6 ; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: add.w r0, r2, r4, lsl #2 ; CHECK-NEXT: adds r4, #2 @@ -279,21 +279,21 @@ ; CHECK-NEXT: letp lr, .LBB2_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: vadd.f32 s12, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: adds r0, r5, #1 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add r10, r11 -; CHECK-NEXT: vadd.f32 s10, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r12, r11 -; CHECK-NEXT: vadd.f32 s6, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add r8, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s8, s12 -; CHECK-NEXT: vadd.f32 s4, s4, s10 -; CHECK-NEXT: vadd.f32 s0, s0, s6 -; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s8, [r0] ; CHECK-NEXT: add.w r0, r2, r5, lsl #2 ; CHECK-NEXT: vstr s4, [r0] ; CHECK-NEXT: adds r0, r5, #2 @@ -450,22 +450,22 @@ ; CHECK-NEXT: letp lr, .LBB3_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 -; CHECK-NEXT: vadd.f32 s16, s14, s15 +; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: vadd.f32 s14, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s10, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s12, s16 -; CHECK-NEXT: vadd.f32 s8, s8, s14 -; CHECK-NEXT: vadd.f32 s4, s4, s10 -; CHECK-NEXT: vadd.f32 s0, s0, s6 -; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vadd.f32 s12, s12, s14 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s12, [r0] ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 ; CHECK-NEXT: vstr s8, [r0] ; CHECK-NEXT: adds r0, r6, #2 @@ -645,26 +645,26 @@ ; CHECK-NEXT: letp lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 -; CHECK-NEXT: vadd.f32 s20, s18, s19 +; CHECK-NEXT: vadd.f32 s18, s18, s19 ; CHECK-NEXT: add.w r3, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 -; CHECK-NEXT: vadd.f32 s18, s14, s15 +; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s10, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s16, s20 -; CHECK-NEXT: vadd.f32 s12, s12, s18 -; CHECK-NEXT: vadd.f32 s4, s4, s14 -; CHECK-NEXT: vadd.f32 s6, s8, s6 -; CHECK-NEXT: vadd.f32 s0, s0, s10 -; CHECK-NEXT: vstr s2, [r3] +; CHECK-NEXT: vadd.f32 s1, s16, s18 +; CHECK-NEXT: vadd.f32 s12, s12, s14 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s6, s8, s10 +; CHECK-NEXT: vstr s1, [r3] ; CHECK-NEXT: add.w r3, r2, r0, lsl #2 ; CHECK-NEXT: vstr s12, [r3] ; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: add.w r3, r2, r3, lsl #2 ; CHECK-NEXT: vstr s6, [r3] ; CHECK-NEXT: adds r3, r0, #3 @@ -858,32 +858,32 @@ ; CHECK-NEXT: letp lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: vadd.f32 s24, s22, s23 +; CHECK-NEXT: vadd.f32 s22, s22, s23 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vadd.f32 s20, s20, s21 -; CHECK-NEXT: vadd.f32 s22, s18, s19 +; CHECK-NEXT: vadd.f32 s18, s18, s19 ; CHECK-NEXT: vadd.f32 s16, s16, s17 -; CHECK-NEXT: vadd.f32 s18, s6, s7 -; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s10, s2, s3 -; CHECK-NEXT: vadd.f32 s2, s20, s24 -; CHECK-NEXT: vadd.f32 s1, s16, s22 -; CHECK-NEXT: vadd.f32 s6, s12, s6 -; CHECK-NEXT: vadd.f32 s4, s4, s18 -; CHECK-NEXT: vadd.f32 s8, s8, s14 -; CHECK-NEXT: vadd.f32 s0, s0, s10 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r0, lsl #2 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s1, s20, s22 +; CHECK-NEXT: vadd.f32 s6, s6, s7 +; CHECK-NEXT: vadd.f32 s3, s16, s18 +; CHECK-NEXT: vadd.f32 s4, s4, s5 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s1, [r1] +; CHECK-NEXT: add.w r1, r2, r0, lsl #2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s3, [r1] ; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vadd.f32 s6, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: adds r1, r0, #4 @@ -1089,19 +1089,17 @@ ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload @@ -1122,32 +1120,32 @@ ; CHECK-NEXT: vadd.f32 s0, s26, s27 ; CHECK-NEXT: add.w r1, r2, r12, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s24, s25 -; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vadd.f32 s1, s22, s23 -; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s20, s10, s11 -; CHECK-NEXT: vadd.f32 s11, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s6, s7 +; CHECK-NEXT: vadd.f32 s3, s20, s21 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s10, s18, s19 -; CHECK-NEXT: vadd.f32 s9, s16, s17 +; CHECK-NEXT: vadd.f32 s10, s10, s11 +; CHECK-NEXT: vadd.f32 s8, s8, s9 +; CHECK-NEXT: vadd.f32 s9, s18, s19 +; CHECK-NEXT: vadd.f32 s11, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s6, s18, s19 -; CHECK-NEXT: vadd.f32 s5, s16, s17 +; CHECK-NEXT: vadd.f32 s5, s18, s19 +; CHECK-NEXT: vadd.f32 s7, s16, s17 ; CHECK-NEXT: vadd.f32 s2, s3, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s14 -; CHECK-NEXT: vadd.f32 s12, s12, s11 -; CHECK-NEXT: vadd.f32 s10, s9, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s20 -; CHECK-NEXT: vadd.f32 s6, s5, s6 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s6, s7, s5 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 +; CHECK-NEXT: vadd.f32 s10, s11, s9 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] ; CHECK-NEXT: adds r1, r0, #4 @@ -1408,33 +1406,33 @@ ; CHECK-NEXT: vadd.f32 s0, s30, s31 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s28, s29 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s5, s14, s15 ; CHECK-NEXT: vadd.f32 s4, s26, s27 ; CHECK-NEXT: vadd.f32 s6, s24, s25 -; CHECK-NEXT: vadd.f32 s14, s18, s19 +; CHECK-NEXT: vadd.f32 s5, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s13, s10, s11 -; CHECK-NEXT: vadd.f32 s10, s18, s19 -; CHECK-NEXT: vadd.f32 s9, s16, s17 +; CHECK-NEXT: vadd.f32 s9, s18, s19 +; CHECK-NEXT: vadd.f32 s11, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s11, s18, s19 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s13, s18, s19 ; CHECK-NEXT: vadd.f32 s15, s16, s17 +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s2, s6, s4 -; CHECK-NEXT: vadd.f32 s6, s12, s5 -; CHECK-NEXT: vadd.f32 s12, s7, s14 -; CHECK-NEXT: vadd.f32 s10, s9, s10 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s10, s11, s9 +; CHECK-NEXT: vadd.f32 s6, s12, s14 +; CHECK-NEXT: vadd.f32 s1, s22, s23 +; CHECK-NEXT: vadd.f32 s14, s15, s13 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s13 -; CHECK-NEXT: vadd.f32 s14, s15, s11 +; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r12, lsl #2 -; CHECK-NEXT: vadd.f32 s1, s22, s23 -; CHECK-NEXT: vadd.f32 s3, s20, s21 +; CHECK-NEXT: vadd.f32 s12, s7, s5 ; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll @@ -159,11 +159,11 @@ ; CHECK-LABEL: build_var0_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: vldr s10, .LCPI9_0 ; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov s8, r0 -; CHECK-NEXT: vldr s10, .LCPI9_0 -; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -183,9 +183,9 @@ ; CHECK-LABEL: build_var1_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: vldr s8, .LCPI10_0 ; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov s10, r0 -; CHECK-NEXT: vldr s8, .LCPI10_0 ; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: vmov.f32 s11, s10 ; CHECK-NEXT: vbic q1, q1, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -79,9 +79,9 @@ define <4 x i32> @shuffle2_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle2_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -100,9 +100,9 @@ define <8 x i16> @shuffle2_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: shuffle2_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i16 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -121,9 +121,9 @@ define <16 x i8> @shuffle2_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: shuffle2_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i8 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -223,9 +223,9 @@ define <4 x i32> @shuffle4_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -253,16 +253,15 @@ ; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: vmov r4, r7, d13 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r5, r2, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt @@ -306,10 +305,11 @@ ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: vmov q5[2], q5[0], r3, r4 ; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.f32 s10, s13 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 +; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: smull r6, r5, r6, r5 ; CHECK-NEXT: asrl r6, r5, #31 ; CHECK-NEXT: smull r4, r7, r4, r3 @@ -522,17 +522,15 @@ ; CHECK-NEXT: vorr q4, q4, q0 ; CHECK-NEXT: vpt.u32 cs, q1, q4 ; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vmov.f32 s24, s18 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 -; CHECK-NEXT: vmov.f32 s28, s22 +; CHECK-NEXT: vmov.f32 s24, s18 ; CHECK-NEXT: vmov.f32 s26, s19 +; CHECK-NEXT: vmov.f32 s28, s22 ; CHECK-NEXT: vmov.f32 s30, s23 ; CHECK-NEXT: vmullb.s32 q0, q7, q6 -; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 ; CHECK-NEXT: sbcs.w r7, r12, r5 ; CHECK-NEXT: mov.w r7, #0 @@ -575,11 +573,13 @@ ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s22 ; CHECK-NEXT: vbic q7, q3, q0 ; CHECK-NEXT: vand q0, q6, q0 ; CHECK-NEXT: vorr q6, q0, q7 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: smull r6, r5, r4, r3 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: asrl r6, r5, #31 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -93,23 +93,23 @@ ; CHECK-LABEL: scaled_v8f16_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vshl.i32 q2, q1, #1 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -111,20 +111,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q2, [r1] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] @@ -184,20 +184,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll @@ -291,8 +291,8 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrw.32 q0, [r0, q2] @@ -310,8 +310,8 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.u32 q2, [r1] +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrw.32 q0, [r0, q2] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -312,18 +312,18 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] @@ -339,10 +339,10 @@ ; CHECK-LABEL: ptr_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -52,30 +52,29 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov.f64 d8, d10 -; CHECK-NEXT: vmov.f32 s18, s21 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f32 s2, s23 ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.f32 s20, s22 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 @@ -84,7 +83,7 @@ ; CHECK-NEXT: vmov d10, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -118,8 +118,8 @@ ; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: bx lr @@ -135,17 +135,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s16, s0 ; CHECK-NEXT: vmov.f32 s13, s4 -; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov.f32 s19, s9 ; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vadd.i32 q0, q3, q1 ; CHECK-NEXT: vpop {d8, d9} @@ -167,18 +167,18 @@ ; CHECK-NEXT: vmov.f32 s16, s3 ; CHECK-NEXT: vmov.f32 s20, s2 ; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vmov.f32 s21, s6 ; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s19, s15 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s23, s14 ; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s1 ; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s22, s9 -; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i32 q0, q0, q5 ; CHECK-NEXT: vadd.i32 q0, q0, q4 @@ -202,12 +202,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -228,14 +228,14 @@ ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -323,30 +323,27 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) { ; CHECK-LABEL: shuffle2step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vmovx.f16 s16, s1 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmovx.f16 s10, s4 ; CHECK-NEXT: vins.f16 s9, s0 ; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s4, s5 ; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s11, s6 +; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmov.f32 s13, s2 ; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmov.f32 s14, s4 ; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vadd.i16 q0, q3, q2 -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> @@ -358,51 +355,54 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle3step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: .vsave {d11, d12, d13} +; CHECK-NEXT: vpush {d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s14, s1 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s4 ; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 +; CHECK-NEXT: vmovx.f16 s15, s7 +; CHECK-NEXT: vins.f16 s13, s14 ; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vins.f16 s14, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s1, s16 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vmovx.f16 s15, s2 +; CHECK-NEXT: vins.f16 s1, s15 +; CHECK-NEXT: vmovx.f16 s15, s5 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov.f32 s15, s9 -; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vins.f16 s10, s20 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s17, s15 ; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vmovx.f16 s1, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vins.f16 s15, s1 +; CHECK-NEXT: vmovx.f16 s1, s11 +; CHECK-NEXT: vins.f16 s10, s1 +; CHECK-NEXT: vmovx.f16 s1, s3 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmovx.f16 s7, s9 ; CHECK-NEXT: vmov.f32 s23, s10 ; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s1, s5 ; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vins.f16 s7, s11 ; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmovnb.i32 q2, q0 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s3, s9 -; CHECK-NEXT: vins.f16 s21, s5 -; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vadd.i16 q0, q3, q5 +; CHECK-NEXT: vadd.i16 q0, q3, q0 ; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -416,53 +416,51 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle4step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmovx.f16 s18, s9 -; CHECK-NEXT: vins.f16 s18, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vins.f16 s18, s16 ; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s16, s15 ; CHECK-NEXT: vmovx.f16 s20, s3 +; CHECK-NEXT: vins.f16 s19, s16 ; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vins.f16 s13, s15 ; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s7 ; CHECK-NEXT: vmovx.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmov.f32 s20, s1 +; CHECK-NEXT: vmovx.f16 s1, s10 ; CHECK-NEXT: vmov.f32 s22, s9 ; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmovx.f16 s24, s10 ; CHECK-NEXT: vmov.f32 s21, s5 ; CHECK-NEXT: vadd.i16 q4, q5, q4 ; CHECK-NEXT: vmovx.f16 s22, s8 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s12 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmov.f32 s10, s8 -; CHECK-NEXT: vmov.f32 s11, s12 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vins.f16 s23, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -769,12 +767,11 @@ ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.8 q4[1], r0 @@ -797,19 +794,20 @@ ; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.8 q4[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] @@ -1028,8 +1026,8 @@ ; CHECK-LABEL: shuffle2_i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -1146,8 +1144,8 @@ ; CHECKFP-NEXT: vmov.f32 s9, s3 ; CHECKFP-NEXT: vmov.f32 s1, s2 ; CHECKFP-NEXT: vmov.f32 s10, s5 -; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s11, s7 +; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s3, s6 ; CHECKFP-NEXT: vadd.f32 q0, q0, q2 ; CHECKFP-NEXT: bx lr @@ -1163,17 +1161,17 @@ ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9} ; CHECKFP-NEXT: vpush {d8, d9} -; CHECKFP-NEXT: vmov.f32 s12, s1 -; CHECKFP-NEXT: vmov.f32 s16, s0 ; CHECKFP-NEXT: vmov.f32 s13, s4 -; CHECKFP-NEXT: vmov.f32 s17, s3 ; CHECKFP-NEXT: vmov.f32 s14, s7 ; CHECKFP-NEXT: vmov.f32 s18, s6 -; CHECKFP-NEXT: vmov.f32 s4, s2 -; CHECKFP-NEXT: vmov.f32 s6, s8 +; CHECKFP-NEXT: vmov.f32 s12, s1 ; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vmov.f32 s16, s0 +; CHECKFP-NEXT: vmov.f32 s17, s3 ; CHECKFP-NEXT: vmov.f32 s19, s9 ; CHECKFP-NEXT: vadd.f32 q3, q4, q3 +; CHECKFP-NEXT: vmov.f32 s4, s2 +; CHECKFP-NEXT: vmov.f32 s6, s8 ; CHECKFP-NEXT: vmov.f32 s7, s11 ; CHECKFP-NEXT: vadd.f32 q0, q3, q1 ; CHECKFP-NEXT: vpop {d8, d9} @@ -1195,18 +1193,18 @@ ; CHECKFP-NEXT: vmov.f32 s16, s3 ; CHECKFP-NEXT: vmov.f32 s20, s2 ; CHECKFP-NEXT: vmov.f32 s17, s7 -; CHECKFP-NEXT: vmov.f32 s21, s6 ; CHECKFP-NEXT: vmov.f32 s18, s11 -; CHECKFP-NEXT: vmov.f32 s22, s10 ; CHECKFP-NEXT: vmov.f32 s19, s15 +; CHECKFP-NEXT: vmov.f32 s21, s6 +; CHECKFP-NEXT: vmov.f32 s22, s10 ; CHECKFP-NEXT: vmov.f32 s23, s14 ; CHECKFP-NEXT: vadd.f32 q4, q5, q4 ; CHECKFP-NEXT: vmov.f32 s20, s1 ; CHECKFP-NEXT: vmov.f32 s21, s5 -; CHECKFP-NEXT: vmov.f32 s1, s4 ; CHECKFP-NEXT: vmov.f32 s22, s9 -; CHECKFP-NEXT: vmov.f32 s2, s8 ; CHECKFP-NEXT: vmov.f32 s23, s13 +; CHECKFP-NEXT: vmov.f32 s1, s4 +; CHECKFP-NEXT: vmov.f32 s2, s8 ; CHECKFP-NEXT: vmov.f32 s3, s12 ; CHECKFP-NEXT: vadd.f32 q0, q0, q5 ; CHECKFP-NEXT: vadd.f32 q0, q0, q4 @@ -1230,12 +1228,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -1256,14 +1254,14 @@ ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> @@ -1340,24 +1338,24 @@ define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) { ; CHECKFP-LABEL: shuffle2step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: vmovx.f16 s12, s1 ; CHECKFP-NEXT: vmovx.f16 s8, s0 -; CHECKFP-NEXT: vins.f16 s8, s12 -; CHECKFP-NEXT: vmovx.f16 s12, s3 +; CHECKFP-NEXT: vmovx.f16 s10, s1 +; CHECKFP-NEXT: vins.f16 s8, s10 ; CHECKFP-NEXT: vmovx.f16 s9, s2 -; CHECKFP-NEXT: vins.f16 s0, s1 -; CHECKFP-NEXT: vins.f16 s9, s12 -; CHECKFP-NEXT: vins.f16 s2, s3 +; CHECKFP-NEXT: vmovx.f16 s10, s3 ; CHECKFP-NEXT: vmovx.f16 s12, s5 +; CHECKFP-NEXT: vins.f16 s9, s10 ; CHECKFP-NEXT: vmovx.f16 s10, s4 ; CHECKFP-NEXT: vins.f16 s10, s12 -; CHECKFP-NEXT: vins.f16 s4, s5 -; CHECKFP-NEXT: vmov.f32 s1, s2 -; CHECKFP-NEXT: vmovx.f16 s12, s7 ; CHECKFP-NEXT: vmovx.f16 s11, s6 +; CHECKFP-NEXT: vmovx.f16 s12, s7 +; CHECKFP-NEXT: vins.f16 s2, s3 ; CHECKFP-NEXT: vins.f16 s6, s7 -; CHECKFP-NEXT: vmov.f32 s2, s4 +; CHECKFP-NEXT: vins.f16 s4, s5 +; CHECKFP-NEXT: vins.f16 s0, s1 +; CHECKFP-NEXT: vmov.f32 s1, s2 ; CHECKFP-NEXT: vins.f16 s11, s12 +; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s3, s6 ; CHECKFP-NEXT: vadd.f16 q0, q0, q2 ; CHECKFP-NEXT: bx lr @@ -1371,45 +1369,43 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) { ; CHECKFP-LABEL: shuffle3step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECKFP-NEXT: vmovx.f16 s16, s2 +; CHECKFP-NEXT: .vsave {d8, d9, d10} +; CHECKFP-NEXT: vpush {d8, d9, d10} ; CHECKFP-NEXT: vmov.f32 s12, s1 -; CHECKFP-NEXT: vins.f16 s12, s16 -; CHECKFP-NEXT: vmovx.f16 s16, s5 +; CHECKFP-NEXT: vmovx.f16 s14, s2 +; CHECKFP-NEXT: vins.f16 s12, s14 ; CHECKFP-NEXT: vmov.f32 s13, s4 -; CHECKFP-NEXT: vmovx.f16 s20, s11 -; CHECKFP-NEXT: vins.f16 s13, s16 -; CHECKFP-NEXT: vmov.f32 s19, s10 -; CHECKFP-NEXT: vins.f16 s19, s20 +; CHECKFP-NEXT: vmovx.f16 s14, s5 +; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vins.f16 s13, s14 +; CHECKFP-NEXT: vmovx.f16 s14, s11 +; CHECKFP-NEXT: vins.f16 s15, s14 ; CHECKFP-NEXT: vmov.f32 s14, s7 -; CHECKFP-NEXT: vmovx.f16 s20, s8 -; CHECKFP-NEXT: vmov.f32 s28, s6 -; CHECKFP-NEXT: vins.f16 s14, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s7 -; CHECKFP-NEXT: vins.f16 s28, s20 -; CHECKFP-NEXT: vmovx.f16 s24, s1 -; CHECKFP-NEXT: vmovx.f16 s20, s0 -; CHECKFP-NEXT: vins.f16 s0, s24 -; CHECKFP-NEXT: vins.f16 s20, s2 -; CHECKFP-NEXT: vmovx.f16 s26, s4 -; CHECKFP-NEXT: vmovx.f16 s21, s3 -; CHECKFP-NEXT: vins.f16 s3, s26 -; CHECKFP-NEXT: vins.f16 s21, s5 -; CHECKFP-NEXT: vmovx.f16 s30, s10 -; CHECKFP-NEXT: vmovx.f16 s23, s9 -; CHECKFP-NEXT: vmov.f32 s18, s8 +; CHECKFP-NEXT: vmovx.f16 s16, s8 +; CHECKFP-NEXT: vmovx.f16 s4, s4 +; CHECKFP-NEXT: vmovx.f16 s7, s7 +; CHECKFP-NEXT: vmov.f32 s20, s6 +; CHECKFP-NEXT: vmovx.f16 s10, s10 +; CHECKFP-NEXT: vmovx.f16 s17, s3 +; CHECKFP-NEXT: vmovx.f16 s19, s9 +; CHECKFP-NEXT: vmovx.f16 s18, s6 +; CHECKFP-NEXT: vins.f16 s14, s16 +; CHECKFP-NEXT: vmovx.f16 s16, s0 +; CHECKFP-NEXT: vmovx.f16 s1, s1 +; CHECKFP-NEXT: vins.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s3, s4 +; CHECKFP-NEXT: vins.f16 s9, s10 +; CHECKFP-NEXT: vins.f16 s0, s1 +; CHECKFP-NEXT: vins.f16 s16, s2 ; CHECKFP-NEXT: vmov.f32 s1, s3 -; CHECKFP-NEXT: vins.f16 s9, s30 -; CHECKFP-NEXT: vins.f16 s23, s11 -; CHECKFP-NEXT: vmov.f32 s2, s28 -; CHECKFP-NEXT: vmovx.f16 s22, s6 +; CHECKFP-NEXT: vins.f16 s17, s5 +; CHECKFP-NEXT: vins.f16 s19, s11 +; CHECKFP-NEXT: vins.f16 s18, s8 +; CHECKFP-NEXT: vmov.f32 s2, s20 ; CHECKFP-NEXT: vmov.f32 s3, s9 -; CHECKFP-NEXT: vins.f16 s22, s8 -; CHECKFP-NEXT: vmov.f32 s15, s19 -; CHECKFP-NEXT: vadd.f16 q0, q0, q5 +; CHECKFP-NEXT: vadd.f16 q0, q0, q4 ; CHECKFP-NEXT: vadd.f16 q0, q0, q3 -; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECKFP-NEXT: vpop {d8, d9, d10} ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> @@ -1425,47 +1421,47 @@ ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECKFP-NEXT: vmovx.f16 s20, s11 ; CHECKFP-NEXT: vmovx.f16 s18, s9 -; CHECKFP-NEXT: vins.f16 s18, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s15 +; CHECKFP-NEXT: vmovx.f16 s16, s11 +; CHECKFP-NEXT: vins.f16 s18, s16 ; CHECKFP-NEXT: vmovx.f16 s19, s13 -; CHECKFP-NEXT: vins.f16 s9, s11 -; CHECKFP-NEXT: vins.f16 s19, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s3 +; CHECKFP-NEXT: vmovx.f16 s16, s15 +; CHECKFP-NEXT: vmovx.f16 s22, s8 +; CHECKFP-NEXT: vins.f16 s19, s16 ; CHECKFP-NEXT: vmovx.f16 s16, s1 -; CHECKFP-NEXT: vmovx.f16 s24, s10 +; CHECKFP-NEXT: vmovx.f16 s20, s3 +; CHECKFP-NEXT: vins.f16 s1, s3 +; CHECKFP-NEXT: vmovx.f16 s3, s10 ; CHECKFP-NEXT: vins.f16 s16, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s7 ; CHECKFP-NEXT: vmovx.f16 s17, s5 -; CHECKFP-NEXT: vins.f16 s13, s15 -; CHECKFP-NEXT: vins.f16 s17, s20 -; CHECKFP-NEXT: vmovx.f16 s22, s8 -; CHECKFP-NEXT: vins.f16 s22, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s14 +; CHECKFP-NEXT: vmovx.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s22, s3 ; CHECKFP-NEXT: vmovx.f16 s23, s12 -; CHECKFP-NEXT: vins.f16 s1, s3 -; CHECKFP-NEXT: vins.f16 s23, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s2 +; CHECKFP-NEXT: vmovx.f16 s3, s14 +; CHECKFP-NEXT: vins.f16 s17, s20 +; CHECKFP-NEXT: vins.f16 s23, s3 ; CHECKFP-NEXT: vmovx.f16 s20, s0 +; CHECKFP-NEXT: vmovx.f16 s3, s2 +; CHECKFP-NEXT: vins.f16 s9, s11 +; CHECKFP-NEXT: vins.f16 s13, s15 ; CHECKFP-NEXT: vins.f16 s5, s7 -; CHECKFP-NEXT: vins.f16 s20, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s6 +; CHECKFP-NEXT: vins.f16 s20, s3 ; CHECKFP-NEXT: vmovx.f16 s21, s4 +; CHECKFP-NEXT: vmovx.f16 s3, s6 ; CHECKFP-NEXT: vins.f16 s8, s10 -; CHECKFP-NEXT: vins.f16 s21, s24 -; CHECKFP-NEXT: vmov.f32 s26, s9 ; CHECKFP-NEXT: vins.f16 s12, s14 -; CHECKFP-NEXT: vins.f16 s0, s2 -; CHECKFP-NEXT: vmov.f32 s27, s13 ; CHECKFP-NEXT: vins.f16 s4, s6 +; CHECKFP-NEXT: vins.f16 s21, s3 +; CHECKFP-NEXT: vins.f16 s0, s2 ; CHECKFP-NEXT: vmov.f32 s24, s1 +; CHECKFP-NEXT: vmov.f32 s26, s9 +; CHECKFP-NEXT: vmov.f32 s27, s13 +; CHECKFP-NEXT: vmov.f32 s25, s5 ; CHECKFP-NEXT: vmov.f32 s2, s8 +; CHECKFP-NEXT: vadd.f16 q4, q6, q4 ; CHECKFP-NEXT: vmov.f32 s3, s12 ; CHECKFP-NEXT: vmov.f32 s1, s4 -; CHECKFP-NEXT: vmov.f32 s25, s5 ; CHECKFP-NEXT: vadd.f16 q0, q0, q5 -; CHECKFP-NEXT: vadd.f16 q4, q6, q4 ; CHECKFP-NEXT: vadd.f16 q0, q0, q4 ; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: bx lr @@ -1495,8 +1491,8 @@ ; CHECK-LABEL: shuffle2_f64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -1559,7 +1555,6 @@ define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) { ; CHECK-LABEL: insert_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <4 x float> undef, float %a, i32 0 @@ -1569,7 +1564,6 @@ define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) { ; CHECK-LABEL: insert_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <8 x half> undef, half %a, i32 0 @@ -1579,7 +1573,6 @@ define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) { ; CHECK-LABEL: insert_f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <2 x double> undef, double %a, i32 0 @@ -1696,7 +1689,6 @@ define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) { ; CHECK-LABEL: extract_f32_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <4 x float> %a, i32 0 @@ -1716,7 +1708,6 @@ define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) { ; CHECK-LABEL: extract_f16_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <8 x half> %a, i32 0 @@ -1736,7 +1727,6 @@ define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) { ; CHECK-LABEL: extract_f64_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <2 x double> %a, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -52,10 +52,10 @@ ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: add sp, #16 @@ -94,9 +94,9 @@ ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: @@ -126,17 +126,17 @@ ; CHECK-NEXT: add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vldrh.s32 q3, [r1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr @@ -195,10 +195,10 @@ ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: add sp, #16 @@ -237,9 +237,9 @@ ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: @@ -269,17 +269,17 @@ ; CHECK-NEXT: add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vldrh.u32 q2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vldrh.u32 q3, [r1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vldrh.u32 q2, [r1, #8] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vldrh.u32 q1, [r1] ; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll --- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -37,12 +37,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -340,12 +340,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll --- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -56,11 +56,10 @@ define arm_aapcs_vfpcc <4 x float> @add_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: add_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vadd.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vadd.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vadd.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vadd.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vadd.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vadd.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vadd.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vadd.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: add_float32_t: @@ -75,27 +74,26 @@ define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: add_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vadd.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vadd.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vadd.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vadd.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vadd.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vadd.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vadd.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vadd.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vadd.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vadd.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vadd.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vadd.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vadd.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: add_float16_t: @@ -189,11 +187,10 @@ define arm_aapcs_vfpcc <4 x float> @sub_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: sub_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vsub.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vsub.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vsub.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vsub.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vsub.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vsub.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vsub.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vsub.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: sub_float32_t: @@ -208,27 +205,26 @@ define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: sub_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vsub.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vsub.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vsub.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vsub.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vsub.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vsub.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vsub.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vsub.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vsub.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vsub.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vsub.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: sub_float16_t: @@ -324,27 +320,26 @@ define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: mul_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vmul.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vmul.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmul.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vmul.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmul.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vmul.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vmul.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmul.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vmul.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vmul.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vmul.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vmul.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vmul.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: mul_float16_t: @@ -359,11 +354,10 @@ define arm_aapcs_vfpcc <4 x float> @mul_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: mul_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmul.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vmul.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vmul.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vmul.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vmul.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vmul.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vmul.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vmul.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: mul_float32_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -6,10 +6,10 @@ define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK-LE-LABEL: vector_add_i8: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i8 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -17,9 +17,9 @@ ; ; CHECK-BE-LABEL: vector_add_i8: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vldrb.u8 q0, [r0] ; CHECK-BE-NEXT: vadd.i8 q0, q1, q0 @@ -35,10 +35,10 @@ define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK-LE-LABEL: vector_add_i16: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i16 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -46,9 +46,9 @@ ; ; CHECK-BE-LABEL: vector_add_i16: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r0] ; CHECK-BE-NEXT: vadd.i16 q0, q1, q0 @@ -64,10 +64,10 @@ define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LE-LABEL: vector_add_i32: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -75,9 +75,9 @@ ; ; CHECK-BE-LABEL: vector_add_i32: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0] ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 @@ -144,10 +144,10 @@ ; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-MVE-NEXT: vmov d9, r2, r3 ; CHECK-MVE-NEXT: vmov d8, r0, r1 ; CHECK-MVE-NEXT: add r0, sp, #64 ; CHECK-MVE-NEXT: vldrw.u32 q6, [r0] +; CHECK-MVE-NEXT: vmov d9, r2, r3 ; CHECK-MVE-NEXT: vmov.u16 r4, q4[0] ; CHECK-MVE-NEXT: vmov.u16 r0, q6[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -239,13 +239,13 @@ ; CHECK-BE-NEXT: push {r4, r5, r7, lr} ; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: add r0, sp, #64 ; CHECK-BE-NEXT: vldrh.u16 q6, [r0] +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q4, q0 -; CHECK-BE-NEXT: vmov.u16 r4, q4[0] ; CHECK-BE-NEXT: vmov.u16 r0, q6[0] +; CHECK-BE-NEXT: vmov.u16 r4, q4[0] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 ; CHECK-BE-NEXT: mov r0, r4 @@ -332,10 +332,10 @@ ; ; CHECK-FP-LABEL: vector_add_f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vmov d0, r0, r1 ; CHECK-FP-NEXT: mov r0, sp ; CHECK-FP-NEXT: vldrw.u32 q1, [r0] +; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov r0, r1, d0 ; CHECK-FP-NEXT: vmov r2, r3, d1 @@ -352,21 +352,21 @@ ; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-MVE-NEXT: .pad #4 ; CHECK-MVE-NEXT: sub sp, #4 -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: mov r4, r0 -; CHECK-MVE-NEXT: add r0, sp, #56 -; CHECK-MVE-NEXT: vldrw.u32 q5, [r0] +; CHECK-MVE-NEXT: add r0, sp, #40 +; CHECK-MVE-NEXT: vldrw.u32 q4, [r0] ; CHECK-MVE-NEXT: mov r6, r1 ; CHECK-MVE-NEXT: mov r0, r3 ; CHECK-MVE-NEXT: mov r5, r2 -; CHECK-MVE-NEXT: vmov r7, r1, d11 +; CHECK-MVE-NEXT: vmov r7, r1, d9 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s19, r0 ; CHECK-MVE-NEXT: mov r0, r5 ; CHECK-MVE-NEXT: mov r1, r7 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov r5, r1, d10 +; CHECK-MVE-NEXT: vmov r5, r1, d8 ; CHECK-MVE-NEXT: vmov s18, r0 ; CHECK-MVE-NEXT: mov r0, r6 ; CHECK-MVE-NEXT: bl __aeabi_fadd @@ -377,7 +377,7 @@ ; CHECK-MVE-NEXT: vmov s16, r0 ; CHECK-MVE-NEXT: vmov r2, r3, d9 ; CHECK-MVE-NEXT: vmov r0, r1, d8 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: add sp, #4 ; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -385,23 +385,23 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vmov d1, r3, r2 +; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-BE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-BE-NEXT: vmov d0, r1, r0 -; CHECK-BE-NEXT: add r1, sp, #64 -; CHECK-BE-NEXT: vldrw.u32 q6, [r1] -; CHECK-BE-NEXT: vrev64.32 q5, q0 -; CHECK-BE-NEXT: vmov r4, r0, d11 -; CHECK-BE-NEXT: vmov r5, r1, d13 +; CHECK-BE-NEXT: add r1, sp, #48 +; CHECK-BE-NEXT: vldrw.u32 q5, [r1] +; CHECK-BE-NEXT: vmov d1, r3, r2 +; CHECK-BE-NEXT: vrev64.32 q4, q0 +; CHECK-BE-NEXT: vmov r4, r0, d9 +; CHECK-BE-NEXT: vmov r5, r1, d11 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s19, r0 ; CHECK-BE-NEXT: mov r0, r4 ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s18, r0 -; CHECK-BE-NEXT: vmov r4, r0, d10 -; CHECK-BE-NEXT: vmov r5, r1, d12 +; CHECK-BE-NEXT: vmov r4, r0, d8 +; CHECK-BE-NEXT: vmov r5, r1, d10 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s17, r0 ; CHECK-BE-NEXT: mov r0, r4 @@ -411,15 +411,15 @@ ; CHECK-BE-NEXT: vrev64.32 q0, q4 ; CHECK-BE-NEXT: vmov r1, r0, d0 ; CHECK-BE-NEXT: vmov r3, r2, d1 -; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-BE-NEXT: vpop {d8, d9, d10, d11} ; CHECK-BE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vmov d0, r0, r1 ; CHECK-FP-NEXT: mov r0, sp ; CHECK-FP-NEXT: vldrw.u32 q1, [r0] +; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vadd.f32 q0, q0, q1 ; CHECK-FP-NEXT: vmov r0, r1, d0 ; CHECK-FP-NEXT: vmov r2, r3, d1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -186,8 +186,8 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -195,12 +195,13 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r5, s8 ; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s8, s10 ; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: asrs r4, r3, #31 ; CHECK-NEXT: subs.w r8, r3, r5 @@ -209,24 +210,21 @@ ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r5, #0, #4 ; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: subs.w r9, r5, r7 ; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 ; CHECK-NEXT: and.w r6, r12, r6, asr #31 ; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: bfi r4, r6, #4, #4 -; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: asrs r7, r6, #31 ; CHECK-NEXT: subs.w r10, r6, r3 -; CHECK-NEXT: asr.w r7, r6, #31 +; CHECK-NEXT: asr.w r6, r5, #31 ; CHECK-NEXT: sbc.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: asrs r6, r5, #31 +; CHECK-NEXT: vmov r7, s8 ; CHECK-NEXT: asr.w r11, r3, #31 ; CHECK-NEXT: and.w r3, r12, r3, asr #31 ; CHECK-NEXT: rsbs r3, r3, #0 @@ -247,7 +245,7 @@ ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: @@ -390,7 +388,13 @@ ; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: vand q5, q5, q0 +; CHECK-NEXT: vmov.f32 s8, s10 ; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: vmov.f32 s10, s11 +; CHECK-NEXT: vmov.f32 s12, s14 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov.f32 s14, s15 +; CHECK-NEXT: vand q3, q3, q0 ; CHECK-NEXT: subs.w r8, r5, r3 ; CHECK-NEXT: vmov r7, r3, d11 ; CHECK-NEXT: sbc.w r4, r6, r4 @@ -398,12 +402,6 @@ ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r5, #0, #4 ; CHECK-NEXT: vmov r5, r6, d9 -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vand q2, q4, q0 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vand q3, q4, q0 ; CHECK-NEXT: subs.w r9, r7, r5 ; CHECK-NEXT: mov.w r7, #1 ; CHECK-NEXT: sbcs r3, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll @@ -759,8 +759,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -769,15 +769,15 @@ ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -785,76 +785,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -871,8 +870,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -884,16 +883,16 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: vmovx.f16 s18, s12 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -901,11 +900,10 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -913,11 +911,11 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -925,14 +923,15 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -940,11 +939,11 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -952,14 +951,14 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -967,10 +966,10 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -979,10 +978,9 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -1000,8 +998,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1010,15 +1008,15 @@ ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1026,76 +1024,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1112,8 +1109,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1122,15 +1119,15 @@ ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1138,76 +1135,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1224,8 +1220,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1234,15 +1230,15 @@ ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1250,76 +1246,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1336,8 +1331,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1346,15 +1341,15 @@ ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1362,76 +1357,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1448,8 +1442,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1461,16 +1455,16 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: vmovx.f16 s18, s12 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1478,11 +1472,10 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1490,11 +1483,11 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1502,14 +1495,15 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1517,11 +1511,11 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1529,14 +1523,14 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1544,10 +1538,10 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1556,10 +1550,9 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1577,8 +1570,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1587,15 +1580,15 @@ ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1603,76 +1596,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1689,8 +1681,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1699,15 +1691,15 @@ ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1715,76 +1707,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1801,8 +1792,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1811,15 +1802,15 @@ ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1827,76 +1818,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1913,8 +1903,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1923,15 +1913,15 @@ ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1939,76 +1929,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -2025,8 +2014,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2035,15 +2024,15 @@ ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2051,76 +2040,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2137,8 +2125,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2147,15 +2135,15 @@ ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2163,76 +2151,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2250,8 +2237,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2260,15 +2247,15 @@ ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2276,76 +2263,75 @@ ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll @@ -801,8 +801,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -822,12 +820,12 @@ ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -839,17 +837,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -861,17 +859,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -882,17 +880,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -912,8 +908,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -937,12 +931,12 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -956,19 +950,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s1, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -982,19 +976,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1007,7 +1001,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -1016,10 +1010,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -1040,8 +1032,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1061,12 +1051,12 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1078,17 +1068,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1100,17 +1090,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1121,17 +1111,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1151,8 +1139,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1172,12 +1158,12 @@ ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1189,17 +1175,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1211,17 +1197,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1232,17 +1218,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1262,8 +1246,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1283,12 +1265,12 @@ ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1300,17 +1282,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1322,17 +1304,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1343,17 +1325,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1373,8 +1353,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1394,12 +1372,12 @@ ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1411,17 +1389,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1433,17 +1411,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1454,17 +1432,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1484,8 +1460,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1509,12 +1483,12 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1528,19 +1502,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s1, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1554,19 +1528,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1579,7 +1553,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1588,10 +1562,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1612,8 +1584,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1633,12 +1603,12 @@ ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1650,17 +1620,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1672,17 +1642,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1693,17 +1663,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1723,8 +1691,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1744,12 +1710,12 @@ ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1761,17 +1727,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1783,17 +1749,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1804,17 +1770,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1834,8 +1798,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1855,12 +1817,12 @@ ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1872,17 +1834,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1894,17 +1856,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1915,17 +1877,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1945,8 +1905,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1966,12 +1924,12 @@ ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1983,17 +1941,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2005,17 +1963,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2026,17 +1984,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -2056,8 +2012,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2077,12 +2031,12 @@ ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2094,17 +2048,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2116,17 +2070,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2137,17 +2091,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2167,8 +2119,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2188,12 +2138,12 @@ ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2205,17 +2155,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2227,17 +2177,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2248,17 +2198,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2279,8 +2227,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2300,12 +2246,12 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2317,17 +2263,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2339,17 +2285,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2360,17 +2306,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: @@ -3190,8 +3134,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3211,12 +3153,12 @@ ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3228,17 +3170,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3250,17 +3192,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3271,17 +3213,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: @@ -3301,8 +3241,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3326,12 +3264,12 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3345,19 +3283,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3371,19 +3309,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3396,7 +3334,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -3405,10 +3343,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: @@ -3429,8 +3365,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3450,12 +3384,12 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3467,17 +3401,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3489,17 +3423,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3510,17 +3444,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: @@ -3540,8 +3472,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3561,12 +3491,12 @@ ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3578,17 +3508,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3600,17 +3530,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3621,17 +3551,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: @@ -3651,8 +3579,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3672,12 +3598,12 @@ ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3689,17 +3615,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3711,17 +3637,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3732,17 +3658,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: @@ -3762,8 +3686,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3783,12 +3705,12 @@ ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3800,17 +3722,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3822,17 +3744,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3843,17 +3765,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: @@ -3873,8 +3793,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3898,12 +3816,12 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3917,19 +3835,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3943,19 +3861,19 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3968,7 +3886,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -3977,10 +3895,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: @@ -4001,8 +3917,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4022,12 +3936,12 @@ ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4039,17 +3953,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4061,17 +3975,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4082,17 +3996,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: @@ -4112,8 +4024,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4133,12 +4043,12 @@ ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4150,17 +4060,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4172,17 +4082,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4193,17 +4103,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: @@ -4223,8 +4131,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4244,12 +4150,12 @@ ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4261,17 +4167,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4283,17 +4189,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4304,17 +4210,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: @@ -4334,8 +4238,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4355,12 +4257,12 @@ ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4372,17 +4274,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4394,17 +4296,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4415,17 +4317,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: @@ -4445,8 +4345,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4466,12 +4364,12 @@ ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4483,17 +4381,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4505,17 +4403,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4526,17 +4424,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: @@ -4556,8 +4452,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4577,12 +4471,12 @@ ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4594,17 +4488,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4616,17 +4510,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4637,17 +4531,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: @@ -4668,8 +4560,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4689,12 +4579,12 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4706,17 +4596,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4728,17 +4618,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4749,17 +4639,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: @@ -4782,8 +4670,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16_bc: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -4803,12 +4689,12 @@ ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4820,17 +4706,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4842,17 +4728,17 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4863,17 +4749,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll @@ -759,8 +759,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -774,43 +772,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -819,20 +817,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -840,17 +838,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -867,8 +863,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -884,7 +878,7 @@ ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -892,15 +886,13 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -908,25 +900,27 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -937,22 +931,22 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -962,7 +956,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -971,10 +965,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -992,8 +984,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1007,43 +997,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -1052,20 +1042,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -1073,17 +1063,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1100,8 +1088,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1115,43 +1101,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -1160,20 +1146,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -1181,17 +1167,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1208,8 +1192,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1223,43 +1205,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -1268,20 +1250,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -1289,17 +1271,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1316,8 +1296,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1331,43 +1309,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -1376,20 +1354,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -1397,17 +1375,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1424,8 +1400,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1441,7 +1415,7 @@ ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1449,15 +1423,13 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1465,25 +1437,27 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1494,22 +1468,22 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1519,7 +1493,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1528,10 +1502,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1549,8 +1521,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1564,43 +1534,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -1609,20 +1579,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -1630,17 +1600,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1657,8 +1625,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1672,43 +1638,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -1717,20 +1683,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -1738,17 +1704,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1765,8 +1729,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1780,43 +1742,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -1825,20 +1787,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -1846,17 +1808,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1873,8 +1833,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1888,43 +1846,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -1933,20 +1891,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -1954,17 +1912,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -1981,8 +1937,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1996,43 +1950,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -2041,20 +1995,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -2062,17 +2016,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2089,8 +2041,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -2104,43 +2054,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -2149,20 +2099,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -2170,17 +2120,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2198,8 +2146,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -2213,43 +2159,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -2258,20 +2204,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -2279,17 +2225,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: @@ -3064,8 +3008,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3079,43 +3021,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3124,20 +3066,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3145,17 +3087,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: @@ -3172,8 +3112,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3189,7 +3127,7 @@ ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3197,15 +3135,13 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3213,25 +3149,27 @@ ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3242,22 +3180,22 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3267,7 +3205,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -3276,10 +3214,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: @@ -3297,8 +3233,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3312,43 +3246,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3357,20 +3291,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3378,17 +3312,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: @@ -3405,8 +3337,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3420,43 +3350,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -3465,20 +3395,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -3486,17 +3416,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: @@ -3513,8 +3441,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3528,43 +3454,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -3573,20 +3499,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -3594,17 +3520,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: @@ -3621,8 +3545,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3636,43 +3558,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -3681,20 +3603,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -3702,17 +3624,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: @@ -3729,8 +3649,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3746,7 +3664,7 @@ ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3754,15 +3672,13 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3770,25 +3686,27 @@ ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3799,22 +3717,22 @@ ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3824,7 +3742,7 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -3833,10 +3751,8 @@ ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: @@ -3854,8 +3770,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3869,43 +3783,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -3914,20 +3828,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -3935,17 +3849,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: @@ -3962,8 +3874,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3977,43 +3887,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -4022,20 +3932,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -4043,17 +3953,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: @@ -4070,8 +3978,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4085,43 +3991,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -4130,20 +4036,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -4151,17 +4057,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: @@ -4178,8 +4082,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4193,43 +4095,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -4238,20 +4140,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -4259,17 +4161,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: @@ -4286,8 +4186,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4301,43 +4199,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -4346,20 +4244,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -4367,17 +4265,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: @@ -4394,8 +4290,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -4409,43 +4303,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -4454,20 +4348,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -4475,17 +4369,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: @@ -4503,8 +4395,6 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -4518,43 +4408,43 @@ ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -4563,20 +4453,20 @@ ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -4584,17 +4474,15 @@ ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @foo_float_int32(<4 x i32> %src) { ; CHECK-MVE-LABEL: foo_float_int32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.f32.s32 s7, s3 -; CHECK-MVE-NEXT: vcvt.f32.s32 s6, s2 -; CHECK-MVE-NEXT: vcvt.f32.s32 s5, s1 -; CHECK-MVE-NEXT: vcvt.f32.s32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vcvt.f32.s32 s3, s3 +; CHECK-MVE-NEXT: vcvt.f32.s32 s2, s2 +; CHECK-MVE-NEXT: vcvt.f32.s32 s1, s1 +; CHECK-MVE-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_float_int32: @@ -24,11 +23,10 @@ define arm_aapcs_vfpcc <4 x float> @foo_float_uint32(<4 x i32> %src) { ; CHECK-MVE-LABEL: foo_float_uint32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.f32.u32 s7, s3 -; CHECK-MVE-NEXT: vcvt.f32.u32 s6, s2 -; CHECK-MVE-NEXT: vcvt.f32.u32 s5, s1 -; CHECK-MVE-NEXT: vcvt.f32.u32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vcvt.f32.u32 s3, s3 +; CHECK-MVE-NEXT: vcvt.f32.u32 s2, s2 +; CHECK-MVE-NEXT: vcvt.f32.u32 s1, s1 +; CHECK-MVE-NEXT: vcvt.f32.u32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_float_uint32: @@ -43,15 +41,15 @@ define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_int32_float: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s2 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s0 -; CHECK-MVE-NEXT: vcvt.s32.f32 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f32 s10, s1 +; CHECK-MVE-NEXT: vcvt.s32.f32 s2, s2 +; CHECK-MVE-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s3 +; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s1 +; CHECK-MVE-NEXT: vmov r0, s2 +; CHECK-MVE-NEXT: vmov r1, s0 +; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-MVE-NEXT: vmov r0, s4 ; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 ; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-MVE-NEXT: bx lr ; @@ -67,15 +65,15 @@ define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_uint32_float: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s2 -; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s0 -; CHECK-MVE-NEXT: vcvt.u32.f32 s8, s3 -; CHECK-MVE-NEXT: vcvt.u32.f32 s10, s1 +; CHECK-MVE-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-MVE-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s3 +; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s1 +; CHECK-MVE-NEXT: vmov r0, s2 +; CHECK-MVE-NEXT: vmov r1, s0 +; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-MVE-NEXT: vmov r0, s4 ; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 ; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-MVE-NEXT: bx lr ; @@ -96,28 +94,28 @@ ; CHECK-MVE-NEXT: vmov s0, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[1] ; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s2 ; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] +; CHECK-MVE-NEXT: vins.f16 s0, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 ; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] +; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s8 +; CHECK-MVE-NEXT: vins.f16 s1, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s8 -; CHECK-MVE-NEXT: vmov s10, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[7] +; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.s32 s10, s10 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vins.f16 s2, s10 ; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vmov s4, r0 ; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr @@ -139,28 +137,28 @@ ; CHECK-MVE-NEXT: vmov s0, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] ; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s2 ; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] +; CHECK-MVE-NEXT: vins.f16 s0, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 -; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 ; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] +; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s8 +; CHECK-MVE-NEXT: vins.f16 s1, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s8 -; CHECK-MVE-NEXT: vmov s10, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] +; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.u32 s10, s10 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vins.f16 s2, s10 ; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: vmov s4, r0 ; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr @@ -177,15 +175,15 @@ define arm_aapcs_vfpcc <8 x i16> @foo_int16_half(<8 x half> %src) { ; CHECK-MVE-LABEL: foo_int16_half: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s2 +; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s2, s0 ; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0 -; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14 +; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2 ; CHECK-MVE-NEXT: vmov r0, s0 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 ; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1 ; CHECK-MVE-NEXT: vmov.16 q0[0], r0 ; CHECK-MVE-NEXT: vmov r0, s14 @@ -219,15 +217,15 @@ define arm_aapcs_vfpcc <8 x i16> @foo_uint16_half(<8 x half> %src) { ; CHECK-MVE-LABEL: foo_uint16_half: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s2 +; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s2, s0 ; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0 -; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14 +; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2 ; CHECK-MVE-NEXT: vmov r0, s0 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 ; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1 ; CHECK-MVE-NEXT: vmov.16 q0[0], r0 ; CHECK-MVE-NEXT: vmov r0, s14 @@ -355,14 +353,13 @@ define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc1(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: vmovn32_trunc1: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s1 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s3 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7 ; CHECK-MVE-NEXT: bx lr ; @@ -380,15 +377,14 @@ define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc2(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: vmovn32_trunc2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s1 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s2 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s3 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s5 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s6 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s7 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vmovn32_trunc2: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -4,11 +4,10 @@ define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) { ; CHECK-LABEL: fpext_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-NEXT: bx lr entry: %out = fpext <4 x half> %src1 to <4 x float> @@ -19,12 +18,12 @@ ; CHECK-LABEL: fpext_8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcvtt.f32.f16 s11, s1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s3 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s3 ; CHECK-NEXT: vcvtt.f32.f16 s9, s0 -; CHECK-NEXT: vcvtt.f32.f16 s5, s2 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 +; CHECK-NEXT: vcvtt.f32.f16 s7, s3 +; CHECK-NEXT: vcvtb.f32.f16 s6, s3 +; CHECK-NEXT: vcvtt.f32.f16 s5, s2 ; CHECK-NEXT: vcvtb.f32.f16 s4, s2 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr @@ -37,11 +36,10 @@ define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) { ; CHECK-LABEL: fptrunc_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtb.f16.f32 s4, s0 -; CHECK-NEXT: vcvtt.f16.f32 s4, s1 -; CHECK-NEXT: vcvtb.f16.f32 s5, s2 -; CHECK-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s2 +; CHECK-NEXT: vcvtt.f16.f32 s0, s1 +; CHECK-NEXT: vcvtt.f16.f32 s1, s3 ; CHECK-NEXT: bx lr entry: %out = fptrunc <4 x float> %src1 to <4 x half> @@ -51,15 +49,14 @@ define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) { ; CHECK-LABEL: fptrunc_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s8 -; CHECK-NEXT: vcvtt.f16.f32 s0, s9 -; CHECK-NEXT: vcvtb.f16.f32 s1, s10 -; CHECK-NEXT: vcvtt.f16.f32 s1, s11 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s2 ; CHECK-NEXT: vcvtb.f16.f32 s2, s4 +; CHECK-NEXT: vcvtt.f16.f32 s0, s1 +; CHECK-NEXT: vcvtt.f16.f32 s1, s3 ; CHECK-NEXT: vcvtt.f16.f32 s2, s5 -; CHECK-NEXT: vcvtb.f16.f32 s3, s6 ; CHECK-NEXT: vcvtt.f16.f32 s3, s7 +; CHECK-NEXT: vcvtb.f16.f32 s4, s6 ; CHECK-NEXT: bx lr entry: %out = fptrunc <8 x float> %src1 to <8 x half> @@ -247,12 +244,12 @@ ; CHECK-NEXT: vld20.16 {q2, q3}, [r0] ; CHECK-NEXT: vld21.16 {q2, q3}, [r0] ; CHECK-NEXT: vcvtt.f32.f16 s3, s9 -; CHECK-NEXT: vcvtt.f32.f16 s7, s11 ; CHECK-NEXT: vcvtb.f32.f16 s2, s9 -; CHECK-NEXT: vcvtb.f32.f16 s6, s11 ; CHECK-NEXT: vcvtt.f32.f16 s1, s8 -; CHECK-NEXT: vcvtt.f32.f16 s5, s10 ; CHECK-NEXT: vcvtb.f32.f16 s0, s8 +; CHECK-NEXT: vcvtt.f32.f16 s7, s11 +; CHECK-NEXT: vcvtb.f32.f16 s6, s11 +; CHECK-NEXT: vcvtt.f32.f16 s5, s10 ; CHECK-NEXT: vcvtb.f32.f16 s4, s10 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -136,7 +136,6 @@ define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) { ; CHECK-LABEL: vdup_f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: vmov.f32 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -16,17 +16,17 @@ define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fadd_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vadd.f32 s6, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f32 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vadd.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vadd.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vadd.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -38,21 +38,21 @@ ; CHECK-FP-LABEL: fadd_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vadd.f32 q0, q0, q1 -; CHECK-FP-NEXT: vadd.f32 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f32 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v8f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vadd.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vadd.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vadd.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vadd.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f32 s2, s10, s14 -; CHECK-NOFP-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vadd.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vadd.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -63,8 +63,8 @@ define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) { ; CHECK-LABEL: fadd_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -75,21 +75,21 @@ define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fadd_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vadd.f16 s6, s1, s6 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vadd.f16 s2, s1, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -102,25 +102,25 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q2 -; CHECK-FP-NEXT: vadd.f16 s6, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -134,37 +134,37 @@ ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 -; CHECK-FP-NEXT: vadd.f16 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v16f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vadd.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vadd.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vadd.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vadd.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vadd.f16 s4, s10, s4 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vadd.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vadd.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vadd.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vadd.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -196,9 +196,9 @@ define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d5, d1, d3 +; CHECK-NEXT: vadd.f64 d1, d1, d3 ; CHECK-NEXT: vadd.f64 d0, d0, d2 -; CHECK-NEXT: vadd.f64 d0, d0, d5 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: vadd.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -209,8 +209,8 @@ define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) { ; CHECK-LABEL: fadd_v2f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s4, s0 -; CHECK-NEXT: vadd.f32 s0, s4, s1 +; CHECK-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) @@ -220,10 +220,10 @@ define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) { ; CHECK-LABEL: fadd_v4f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s4, s0 -; CHECK-NEXT: vadd.f32 s4, s4, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s2 -; CHECK-NEXT: vadd.f32 s0, s4, s3 +; CHECK-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) @@ -233,10 +233,10 @@ define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) { ; CHECK-LABEL: fadd_v8f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s8, s8, s0 -; CHECK-NEXT: vadd.f32 s8, s8, s1 -; CHECK-NEXT: vadd.f32 s8, s8, s2 -; CHECK-NEXT: vadd.f32 s0, s8, s3 +; CHECK-NEXT: vadd.f32 s0, s8, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s6 @@ -250,12 +250,12 @@ define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) { ; CHECK-LABEL: fadd_v4f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NEXT: vadd.f16 s2, s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) @@ -266,17 +266,17 @@ ; CHECK-LABEL: fadd_v8f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vadd.f16 s4, s4, s2 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vadd.f16 s4, s4, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) @@ -287,18 +287,18 @@ ; CHECK-LABEL: fadd_v16f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.f16 s8, s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s0 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vadd.f16 s8, s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vadd.f16 s8, s8, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vadd.f16 s8, s8, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vadd.f16 s0, s8, s0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s8 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vadd.f16 s0, s0, s4 ; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s5 @@ -329,8 +329,8 @@ define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fadd_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d2, d2, d0 -; CHECK-NEXT: vadd.f64 d0, d2, d1 +; CHECK-NEXT: vadd.f64 d0, d2, d0 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) @@ -340,8 +340,8 @@ define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fadd_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d4, d4, d0 -; CHECK-NEXT: vadd.f64 d0, d4, d1 +; CHECK-NEXT: vadd.f64 d0, d4, d0 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: vadd.f64 d0, d0, d2 ; CHECK-NEXT: vadd.f64 d0, d0, d3 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -15,16 +15,16 @@ define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -36,9 +36,9 @@ ; CHECK-FP-LABEL: fmin_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32: @@ -49,15 +49,15 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s12 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -67,20 +67,20 @@ define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -92,24 +92,24 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -122,9 +122,9 @@ ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16: @@ -132,42 +132,42 @@ ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -199,10 +199,10 @@ ; CHECK-NEXT: vcmp.f64 d3, d1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -222,16 +222,16 @@ define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -242,20 +242,20 @@ ; CHECK-FP-LABEL: fmin_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -265,20 +265,20 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -290,24 +290,24 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -320,36 +320,36 @@ ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -378,9 +378,9 @@ define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmin_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f64 d4, d1, d3 +; CHECK-NEXT: vminnm.f64 d1, d1, d3 ; CHECK-NEXT: vminnm.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -403,17 +403,17 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -427,9 +427,9 @@ ; CHECK-FP-LABEL: fmin_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -441,15 +441,15 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s14 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -462,21 +462,21 @@ define arm_aapcs_vfpcc half @fmin_v4f16_acc(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmin_v4f16_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vminnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -489,8 +489,8 @@ define arm_aapcs_vfpcc half @fmin_v2f16_acc(<2 x half> %x, half %y) { ; CHECK-LABEL: fmin_v2f16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -505,25 +505,25 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vminnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -539,9 +539,9 @@ ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -550,42 +550,42 @@ ; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -626,10 +626,10 @@ ; CHECK-NEXT: vcmp.f64 d3, d1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vminnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -657,9 +657,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -667,9 +667,9 @@ ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -685,9 +685,9 @@ ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -695,13 +695,13 @@ ; ; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -716,11 +716,11 @@ define arm_aapcs_vfpcc half @fmin_v4f16_acc_nofast(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vminnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -728,11 +728,11 @@ ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -749,9 +749,9 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vminnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -760,16 +760,16 @@ ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -787,9 +787,9 @@ ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s8, s0 @@ -797,29 +797,29 @@ ; ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s8, s0 @@ -863,9 +863,9 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmin_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f64 d5, d1, d3 +; CHECK-NEXT: vminnm.f64 d1, d1, d3 ; CHECK-NEXT: vminnm.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d0, d4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 @@ -890,16 +890,16 @@ define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -910,9 +910,9 @@ ; CHECK-FP-LABEL: fmax_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32: @@ -923,15 +923,15 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s12 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -941,20 +941,20 @@ define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -966,24 +966,24 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -996,9 +996,9 @@ ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16: @@ -1006,42 +1006,42 @@ ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1073,10 +1073,10 @@ ; CHECK-NEXT: vcmp.f64 d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1096,16 +1096,16 @@ define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -1116,20 +1116,20 @@ ; CHECK-FP-LABEL: fmax_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -1139,20 +1139,20 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1164,24 +1164,24 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1194,36 +1194,36 @@ ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s8, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1252,9 +1252,9 @@ define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmax_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f64 d4, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d1, d1, d3 ; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1277,17 +1277,17 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1301,9 +1301,9 @@ ; CHECK-FP-LABEL: fmax_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -1315,15 +1315,15 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s14 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1336,8 +1336,8 @@ define arm_aapcs_vfpcc half @fmax_v2f16_acc(<2 x half> %x, half %y) { ; CHECK-LABEL: fmax_v2f16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -1350,21 +1350,21 @@ define arm_aapcs_vfpcc half @fmax_v4f16_acc(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmax_v4f16_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1379,25 +1379,25 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1413,9 +1413,9 @@ ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -1424,42 +1424,42 @@ ; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1500,10 +1500,10 @@ ; CHECK-NEXT: vcmp.f64 d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vmaxnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -1531,9 +1531,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -1541,9 +1541,9 @@ ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -1559,9 +1559,9 @@ ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -1569,13 +1569,13 @@ ; ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -1590,11 +1590,11 @@ define arm_aapcs_vfpcc half @fmax_v4f16_acc_nofast(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -1602,11 +1602,11 @@ ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -1623,9 +1623,9 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -1634,16 +1634,16 @@ ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -1661,9 +1661,9 @@ ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s8, s0 @@ -1671,29 +1671,29 @@ ; ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s8, s0 @@ -1737,9 +1737,9 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmax_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f64 d5, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d1, d1, d3 ; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d4, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll @@ -16,17 +16,17 @@ define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmul_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmul.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmul.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmul.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmul.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -38,21 +38,21 @@ ; CHECK-FP-LABEL: fmul_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmul.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmul.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v8f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmul.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmul.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmul.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmul.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f32 s2, s10, s14 -; CHECK-NOFP-NEXT: vmul.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmul.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmul.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -63,8 +63,8 @@ define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) { ; CHECK-LABEL: fmul_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -75,21 +75,21 @@ define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmul_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmul.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmul.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -102,25 +102,25 @@ ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmul.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -134,37 +134,37 @@ ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmul.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v16f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmul.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmul.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmul.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vmul.f16 s4, s10, s4 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmul.f16 s4, s2, s6 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmul.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmul.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmul.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -196,9 +196,9 @@ define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) { ; CHECK-LABEL: fmul_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d5, d1, d3 +; CHECK-NEXT: vmul.f64 d1, d1, d3 ; CHECK-NEXT: vmul.f64 d0, d0, d2 -; CHECK-NEXT: vmul.f64 d0, d0, d5 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: vmul.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -209,8 +209,8 @@ define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) { ; CHECK-LABEL: fmul_v2f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s4, s4, s0 -; CHECK-NEXT: vmul.f32 s0, s4, s1 +; CHECK-NEXT: vmul.f32 s0, s4, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) @@ -220,10 +220,10 @@ define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) { ; CHECK-LABEL: fmul_v4f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s4, s4, s0 -; CHECK-NEXT: vmul.f32 s4, s4, s1 -; CHECK-NEXT: vmul.f32 s4, s4, s2 -; CHECK-NEXT: vmul.f32 s0, s4, s3 +; CHECK-NEXT: vmul.f32 s0, s4, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: vmul.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) @@ -233,10 +233,10 @@ define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) { ; CHECK-LABEL: fmul_v8f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s8, s8, s0 -; CHECK-NEXT: vmul.f32 s8, s8, s1 -; CHECK-NEXT: vmul.f32 s8, s8, s2 -; CHECK-NEXT: vmul.f32 s0, s8, s3 +; CHECK-NEXT: vmul.f32 s0, s8, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: vmul.f32 s0, s0, s3 ; CHECK-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NEXT: vmul.f32 s0, s0, s5 ; CHECK-NEXT: vmul.f32 s0, s0, s6 @@ -250,9 +250,9 @@ define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) { ; CHECK-LABEL: fmul_v2f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f16 s4, s4, s0 +; CHECK-NEXT: vmul.f16 s2, s4, s0 ; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmul.f16 s0, s2, s0 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) @@ -262,12 +262,12 @@ define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) { ; CHECK-LABEL: fmul_v4f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s1 -; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmul.f16 s2, s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmul.f16 s0, s2, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) @@ -278,17 +278,17 @@ ; CHECK-LABEL: fmul_v8f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmul.f16 s4, s4, s2 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmul.f16 s4, s4, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) @@ -299,18 +299,18 @@ ; CHECK-LABEL: fmul_v16f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.f16 s8, s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s0 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmul.f16 s8, s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vmul.f16 s8, s8, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmul.f16 s8, s8, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmul.f16 s0, s8, s0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s8 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vmul.f16 s0, s0, s4 ; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s5 @@ -341,8 +341,8 @@ define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmul_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d2, d2, d0 -; CHECK-NEXT: vmul.f64 d0, d2, d1 +; CHECK-NEXT: vmul.f64 d0, d2, d0 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) @@ -352,8 +352,8 @@ define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmul_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d4, d4, d0 -; CHECK-NEXT: vmul.f64 d0, d4, d1 +; CHECK-NEXT: vmul.f64 d0, d4, d0 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: vmul.f64 d0, d0, d2 ; CHECK-NEXT: vmul.f64 d0, d0, d3 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -503,10 +503,10 @@ ; CHECK-NEXT: vadd.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB5_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vadd.f32 s4, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s0, s0, s4 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: beq .LBB5_9 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -601,10 +601,10 @@ ; CHECK-NEXT: vmul.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmul.f32 s4, s2, s3 +; CHECK-NEXT: vmul.f32 s2, s2, s3 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: vmul.f32 s0, s0, s1 -; CHECK-NEXT: vmul.f32 s0, s0, s4 +; CHECK-NEXT: vmul.f32 s0, s0, s2 ; CHECK-NEXT: beq .LBB6_9 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1464,9 +1464,9 @@ ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB15_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 @@ -1567,9 +1567,9 @@ ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB16_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -54,17 +54,17 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vrhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 @@ -77,27 +77,26 @@ ; CHECK-NEXT: adc.w r3, r2, r3, asr #31 ; CHECK-NEXT: adds r2, r1, #1 ; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: adc.w r3, r2, r3, asr #31 ; CHECK-NEXT: adds r2, r1, #1 ; CHECK-NEXT: adc r1, r3, #0 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -153,17 +152,17 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 @@ -173,22 +172,21 @@ ; CHECK-NEXT: asr.w r12, r1, #31 ; CHECK-NEXT: adc.w r1, r12, r3, asr #31 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r2, r1, r3 ; CHECK-NEXT: asr.w r12, r1, #31 ; CHECK-NEXT: adc.w r1, r12, r3, asr #31 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -255,10 +253,10 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q3, q3, q4 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov r2, r3, d6 @@ -356,10 +354,10 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q3, q3, q4 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov r2, r3, d6 @@ -498,23 +496,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: .LBB14_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: asrs r4, r3, #31 ; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 ; CHECK-NEXT: adc.w r3, r4, r5, asr #31 ; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: lsrl r12, r3, #1 @@ -523,24 +521,24 @@ ; CHECK-NEXT: asr.w r4, r3, #31 ; CHECK-NEXT: adc.w r3, r4, r5, asr #31 ; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov q3[2], q3[0], r6, r12 ; CHECK-NEXT: adds r4, r3, r5 ; CHECK-NEXT: asr.w r6, r3, #31 ; CHECK-NEXT: adc.w r3, r6, r5, asr #31 ; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: adds r6, r3, r5 ; CHECK-NEXT: asr.w r12, r3, #31 ; CHECK-NEXT: adc.w r3, r12, r5, asr #31 ; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r4 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: le lr, .LBB14_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: br label %vector.body @@ -677,10 +675,10 @@ ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 ; CHECK-NEXT: vldrw.u32 q4, [r1], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r5, d2 ; CHECK-NEXT: vmov r4, r6, d4 @@ -859,10 +857,10 @@ ; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r12, d2 ; CHECK-NEXT: vmov r4, r5, d4 @@ -1049,10 +1047,10 @@ ; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r12, d2 ; CHECK-NEXT: vmov r4, r5, d4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -70,15 +70,13 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r4, r7, d4 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r5, d0 -; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r4, r7, d4 ; CHECK-NEXT: vmov r3, r6, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r6, r6, r12 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -77,12 +77,10 @@ ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 -; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vadd.i32 q5, q5, q6 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q1, q2 @@ -102,14 +100,14 @@ define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld2_v4i32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vadd.i32 q0, q1, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -207,25 +205,25 @@ ; CHECK-LABEL: vld2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s12, s11 ; CHECK-NEXT: vmovx.f16 s7, s10 -; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vins.f16 s7, s12 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -324,15 +322,13 @@ ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r0, r4, d4 ; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r2, r2, r12 @@ -356,34 +352,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r5, r6, d4 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vmov r0, r7, d8 -; CHECK-NEXT: vmov r5, r6, d4 ; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov r3, r4, d7 ; CHECK-NEXT: adds r0, r0, r5 ; CHECK-NEXT: adc.w r8, r6, r7 -; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: vmov r2, r7, d0 ; CHECK-NEXT: adds r3, r3, r6 ; CHECK-NEXT: adc.w r6, r5, r4 @@ -396,7 +388,7 @@ ; CHECK-NEXT: adc.w r0, r7, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 @@ -480,12 +472,10 @@ ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 -; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vadd.f32 q5, q5, q6 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vadd.f32 q1, q1, q2 @@ -505,14 +495,14 @@ define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld2_v4f32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vadd.f32 q0, q1, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -535,11 +525,11 @@ ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vadd.f16 q0, q0, q2 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: bx lr @@ -556,14 +546,14 @@ ; CHECK-LABEL: vld2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, r2, d0 @@ -620,25 +610,25 @@ ; CHECK-LABEL: vld2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s12, s11 ; CHECK-NEXT: vmovx.f16 s7, s10 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vins.f16 s7, s12 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -10,7 +10,6 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r12, lr, d0 ; CHECK-NEXT: vmov r3, s6 @@ -37,20 +36,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -72,37 +71,37 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vadd.i32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] @@ -124,71 +123,71 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 ; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] ; CHECK-NEXT: vadd.i32 q2, q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d14, d8 -; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 ; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 ; CHECK-NEXT: vmov.f32 s31, s21 ; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s15, s23 ; CHECK-NEXT: vadd.i32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] @@ -216,23 +215,22 @@ ; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: mov r3, sp ; CHECK-NEXT: str r2, [sp] -; CHECK-NEXT: vmov.f64 d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f64 d6, d1 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vldrh.u32 q1, [r3] -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.f32 s6, s4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1, #2] ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1] ; CHECK-NEXT: add sp, #8 @@ -292,49 +290,49 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s2, s15 ; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vins.f16 s19, s2 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmovnb.i32 q5, q0 -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f64 d8, d2 -; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vins.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s5, s8 ; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s5, s11 ; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vins.f16 s18, s20 -; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmovx.f16 s11, s13 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s5, s7 +; CHECK-NEXT: vmovnb.i32 q5, q0 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmovx.f16 s14, s14 ; CHECK-NEXT: vmov.f32 s19, s13 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s20, s6 -; CHECK-NEXT: vmovx.f16 s21, s7 -; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s7, s13 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s7, s15 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vadd.i16 q1, q4, q5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s10, s12 +; CHECK-NEXT: vins.f16 s11, s15 +; CHECK-NEXT: vins.f16 s19, s14 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovnb.i32 q3, q1 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vadd.i16 q1, q4, q1 ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -355,103 +353,98 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d0, d2 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 ; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmovx.f16 s14, s18 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vins.f16 s2, s12 ; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vins.f16 s3, s14 +; CHECK-NEXT: vmovx.f16 s14, s19 +; CHECK-NEXT: vins.f16 s18, s14 ; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmovx.f16 s12, s9 ; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s13, s12 ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vins.f16 s3, s20 -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s5, s7 +; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmovx.f16 s11, s17 ; CHECK-NEXT: vmov.f32 s23, s18 ; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vins.f16 s10, s16 +; CHECK-NEXT: vins.f16 s11, s19 ; CHECK-NEXT: vmovnb.i32 q6, q3 -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s20, s6 -; CHECK-NEXT: vmovx.f16 s21, s7 -; CHECK-NEXT: vins.f16 s6, s16 -; CHECK-NEXT: vmovx.f16 s7, s17 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s7, s19 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmovnb.i32 q4, q1 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s6, s18 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vadd.i16 q0, q0, q5 -; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmovx.f16 s6, s10 ; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vins.f16 s4, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmovx.f16 s20, s19 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vmovx.f16 s7, s19 ; CHECK-NEXT: vmov.f32 s27, s18 -; CHECK-NEXT: vins.f16 s27, s20 -; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vins.f16 s27, s7 ; CHECK-NEXT: vmov.f32 s26, s16 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s15 ; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vmovnb.i32 q7, q1 -; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s20, s6 +; CHECK-NEXT: vmovx.f16 s6, s12 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s9, s11 +; CHECK-NEXT: vins.f16 s21, s6 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmovx.f16 s15, s17 ; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vins.f16 s22, s0 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vins.f16 s9, s13 +; CHECK-NEXT: vins.f16 s14, s16 +; CHECK-NEXT: vins.f16 s15, s19 +; CHECK-NEXT: vins.f16 s22, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vins.f16 s23, s6 +; CHECK-NEXT: vmovnb.i32 q4, q2 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s6, s30 +; CHECK-NEXT: vadd.i16 q2, q5, q2 ; CHECK-NEXT: vmov.f32 s7, s27 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vins.f16 s24, s10 -; CHECK-NEXT: vins.f16 s23, s0 -; CHECK-NEXT: vins.f16 s2, s16 -; CHECK-NEXT: vmovx.f16 s25, s11 -; CHECK-NEXT: vmovx.f16 s3, s17 -; CHECK-NEXT: vins.f16 s25, s13 -; CHECK-NEXT: vins.f16 s3, s19 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovnb.i32 q2, q6 -; CHECK-NEXT: vmov.f32 s26, s10 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vadd.i16 q0, q5, q6 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -558,22 +551,21 @@ ; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.16 q3[1], r0 ; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov.16 q3[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: vmovx.f16 s16, s6 @@ -581,6 +573,7 @@ ; CHECK-NEXT: vmovx.f16 s11, s5 ; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vins.f16 s18, s16 +; CHECK-NEXT: vins.f16 s10, s4 ; CHECK-NEXT: vins.f16 s11, s7 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.u8 r0, q0[2] @@ -653,12 +646,11 @@ ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.8 q4[1], r0 @@ -681,19 +673,20 @@ ; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.8 q4[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[5] @@ -753,19 +746,15 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 @@ -795,50 +784,42 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q6, [r0, #80] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s7, s13 -; CHECK-NEXT: vmov.f32 s11, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r3, r8, d5 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s22, s24 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s25 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov r6, r7, d10 -; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: vmov r3, r8, d7 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.f32 s24, s22 +; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r6, r7, d12 ; CHECK-NEXT: adds.w r0, r5, lr ; CHECK-NEXT: adc.w r5, r4, r12 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r4, r2, d6 +; CHECK-NEXT: vmov r4, r2, d10 ; CHECK-NEXT: adc.w r12, r5, r8 ; CHECK-NEXT: vmov r5, r0, d8 ; CHECK-NEXT: adds r6, r6, r4 ; CHECK-NEXT: adcs r2, r7 ; CHECK-NEXT: adds r6, r6, r5 ; CHECK-NEXT: adc.w r8, r2, r0 -; CHECK-NEXT: vmov r7, r4, d11 -; CHECK-NEXT: vmov r2, r5, d7 +; CHECK-NEXT: vmov r7, r4, d1 +; CHECK-NEXT: vmov r2, r5, d9 ; CHECK-NEXT: vmov r3, r0, d0 ; CHECK-NEXT: adds r2, r2, r7 ; CHECK-NEXT: adc.w r7, r5, r4 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r5, r4, d7 ; CHECK-NEXT: adds r2, r2, r5 ; CHECK-NEXT: adcs r7, r4 ; CHECK-NEXT: vmov r5, r4, d2 @@ -853,7 +834,7 @@ ; CHECK-NEXT: adcs r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 @@ -874,7 +855,7 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldr s1, [r0, #16] ; CHECK-NEXT: vldr s5, [r0, #20] -; CHECK-NEXT: vmov.f64 d6, d4 +; CHECK-NEXT: vmov.f32 s12, s8 ; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vmov.f32 s0, s9 ; CHECK-NEXT: vadd.f32 q0, q3, q0 @@ -898,20 +879,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -933,37 +914,37 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] @@ -985,71 +966,71 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 ; CHECK-NEXT: vadd.f32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] ; CHECK-NEXT: vadd.f32 q2, q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d14, d8 -; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 ; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 ; CHECK-NEXT: vmov.f32 s31, s21 ; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s15, s23 ; CHECK-NEXT: vadd.f32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] @@ -1079,9 +1060,9 @@ ; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vins.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s2, s1 ; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vadd.f16 q1, q0, q2 ; CHECK-NEXT: vmov.f32 s0, s1 ; CHECK-NEXT: vadd.f16 q0, q1, q0 @@ -1102,32 +1083,29 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: ldrd r2, r3, [r0, #16] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vins.f16 s1, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s7, s0 ; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s12, s6 ; CHECK-NEXT: vins.f16 s13, s9 ; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 @@ -1143,49 +1121,47 @@ define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s17 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vins.f16 s11, s20 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmov.f32 s28, s18 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s26, s16 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vins.f16 s21, s17 -; CHECK-NEXT: vmovx.f16 s30, s14 -; CHECK-NEXT: vmovx.f16 s23, s13 -; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s7, s6 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s13, s30 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmov.f32 s2, s28 -; CHECK-NEXT: vmovx.f16 s22, s18 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vins.f16 s17, s13 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 @@ -1201,89 +1177,85 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s7, s6 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vmov.f32 s28, s14 -; CHECK-NEXT: vmovx.f16 s30, s10 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vmovx.f16 s23, s9 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vins.f16 s17, s13 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s9, s30 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vmov.f32 s2, s28 -; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s19 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vadd.f16 q1, q0, q1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vins.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s16, s13 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vins.f16 s7, s6 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vmov.f32 s28, s14 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vmovx.f16 s30, s10 -; CHECK-NEXT: vmovx.f16 s23, s9 -; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vins.f16 s9, s30 -; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vmov.f32 s2, s28 -; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vins.f16 s17, s13 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s2, s20 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s19 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 @@ -1303,11 +1275,11 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vadd.f64 d4, d3, d0 -; CHECK-NEXT: vadd.f64 d5, d6, d7 -; CHECK-NEXT: vadd.f64 d1, d4, d1 -; CHECK-NEXT: vadd.f64 d0, d5, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d0, d3, d0 +; CHECK-NEXT: vadd.f64 d3, d4, d5 +; CHECK-NEXT: vadd.f64 d1, d0, d1 +; CHECK-NEXT: vadd.f64 d0, d3, d2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1324,25 +1296,25 @@ define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vld3_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vadd.f64 d5, d6, d7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vadd.f64 d4, d1, d2 -; CHECK-NEXT: vadd.f64 d10, d9, d6 -; CHECK-NEXT: vadd.f64 d11, d12, d13 -; CHECK-NEXT: vadd.f64 d3, d4, d3 -; CHECK-NEXT: vadd.f64 d2, d5, d0 -; CHECK-NEXT: vadd.f64 d1, d10, d7 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vadd.f64 d0, d11, d8 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vadd.f64 d2, d4, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vadd.f64 d4, d7, d4 +; CHECK-NEXT: vadd.f64 d7, d8, d9 +; CHECK-NEXT: vadd.f64 d1, d1, d3 +; CHECK-NEXT: vadd.f64 d0, d2, d0 +; CHECK-NEXT: vadd.f64 d3, d4, d5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vadd.f64 d2, d7, d6 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x double>, <12 x double>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -6,18 +6,14 @@ define <16 x i32> *@vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -38,18 +34,14 @@ define <32 x i16> *@vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 4 @@ -70,18 +62,14 @@ define <64 x i8> *@vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i8 q4, q2, q3 +; CHECK-NEXT: vadd.i8 q2, q2, q3 ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, q4 +; CHECK-NEXT: vadd.i8 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i8>, <64 x i8>* %src, align 4 @@ -109,23 +97,19 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0], #64 -; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov r4, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r7, d1 -; CHECK-NEXT: vmov r4, r8, d7 -; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r3, r6, d1 ; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r7, r7, r12 ; CHECK-NEXT: adds r3, r3, r4 @@ -166,18 +150,14 @@ define <16 x float> *@vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q4 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -198,18 +178,14 @@ define <32 x half> *@vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q2, q2, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -6,17 +6,17 @@ define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) { ; CHECK-LABEL: vld4_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s8, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s8, s3 +; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: add.w r12, r2, r0 @@ -44,18 +44,14 @@ define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -79,7 +75,6 @@ ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i32 q4, q2, q3 ; CHECK-NEXT: vadd.i32 q5, q0, q1 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] @@ -88,10 +83,9 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q5, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -111,12 +105,10 @@ define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vld4_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -124,52 +116,40 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 -; CHECK-NEXT: vadd.i32 q6, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q2, q6, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q2, q3, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q1, q3, q1 -; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.i32 q1, q5, q6 ; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q2, q1 +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q6, q7 +; CHECK-NEXT: vadd.i32 q2, q4, q5 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #112 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i32>, <64 x i32>* %src, align 4 @@ -189,25 +169,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q2, [r0] -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.f64 d10, d5 +; CHECK-NEXT: vmov.f32 s20, s10 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s21, s14 -; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s23, s2 ; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s9 ; CHECK-NEXT: vmov.f32 s21, s13 -; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s23, s1 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i32 q0, q2, q5 ; CHECK-NEXT: vadd.i32 q0, q0, q4 @@ -316,18 +296,14 @@ define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 2 @@ -351,7 +327,6 @@ ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i16 q4, q2, q3 ; CHECK-NEXT: vadd.i16 q5, q0, q1 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] @@ -360,10 +335,9 @@ ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q5, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -383,58 +357,56 @@ define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vins.f16 s18, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vins.f16 s19, s0 ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmovx.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.f32 s22, s5 ; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vmovx.f16 s12, s3 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vmovx.f16 s17, s13 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmov.f32 s22, s5 ; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vmovx.f16 s20, s15 ; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s17, s20 ; CHECK-NEXT: vmov.f32 s20, s1 +; CHECK-NEXT: vmovx.f16 s1, s6 ; CHECK-NEXT: vmov.f32 s21, s13 ; CHECK-NEXT: vadd.i16 q4, q5, q4 ; CHECK-NEXT: vmovx.f16 s22, s4 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmov.f32 s6, s4 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s1, s10 ; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vins.f16 s23, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s8 ; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 1 @@ -608,18 +580,14 @@ define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i8 q4, q2, q3 +; CHECK-NEXT: vadd.i8 q2, q2, q3 ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, q4 +; CHECK-NEXT: vadd.i8 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i8>, <64 x i8>* %src, align 1 @@ -646,23 +614,19 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov r0, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r0, r8, d7 -; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r5, r6, d1 ; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: vmov r4, r12, d2 @@ -706,54 +670,45 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vldrw.u32 q7, [r0, #112] +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov r3, r2, d11 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vmov.f32 s0, s26 +; CHECK-NEXT: vmov.f32 s1, s27 +; CHECK-NEXT: vmov lr, r12, d9 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vmov.f64 d4, d11 -; CHECK-NEXT: vmov.f32 s9, s23 -; CHECK-NEXT: vmov r3, r2, d7 -; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vmov.f32 s11, s19 -; CHECK-NEXT: vmov.f32 s22, s16 -; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov r0, r6, d15 -; CHECK-NEXT: vmov.f64 d14, d11 -; CHECK-NEXT: vmov.f32 s29, s23 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov.f32 s30, s26 -; CHECK-NEXT: vmov.f32 s22, s24 -; CHECK-NEXT: vmov.f32 s31, s27 -; CHECK-NEXT: vmov.f32 s23, s25 -; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vmov.f32 s6, s28 +; CHECK-NEXT: vmov.f32 s7, s29 +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vmov.f32 s11, s21 +; CHECK-NEXT: vmov r0, r6, d1 ; CHECK-NEXT: adds r7, r4, r3 +; CHECK-NEXT: vmov r4, r8, d0 ; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r4, r8, d14 -; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: vmov.f32 s25, s19 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vmov.f32 s19, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov.f32 s0, s18 +; CHECK-NEXT: vmov.f32 s1, s19 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r6, r6, r12 ; CHECK-NEXT: adds.w lr, r0, r7 ; CHECK-NEXT: adc.w r12, r6, r5 -; CHECK-NEXT: vmov r6, r5, d12 +; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov r4, r0, d8 ; CHECK-NEXT: adc.w r3, r3, r8 @@ -762,11 +717,11 @@ ; CHECK-NEXT: adds.w r9, r6, r2 ; CHECK-NEXT: adc.w r8, r0, r3 ; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vmov r3, r6, d11 -; CHECK-NEXT: vmov r7, r0, d9 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r7, r0, d5 ; CHECK-NEXT: adds r3, r3, r5 ; CHECK-NEXT: adcs r6, r4 -; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: vmov r5, r4, d11 ; CHECK-NEXT: adds r5, r5, r7 ; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: adds r3, r3, r5 @@ -808,11 +763,11 @@ define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) { ; CHECK-LABEL: vld4_v2f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f64 d6, d3 ; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s13, s2 ; CHECK-NEXT: vadd.f32 q2, q3, q2 ; CHECK-NEXT: vmov.f32 s12, s5 @@ -838,18 +793,14 @@ define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q4 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -873,7 +824,6 @@ ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.f32 q4, q2, q3 ; CHECK-NEXT: vadd.f32 q5, q0, q1 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] @@ -882,10 +832,9 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q5, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -905,12 +854,10 @@ define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vld4_v16f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -918,52 +865,40 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 -; CHECK-NEXT: vadd.f32 q6, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q1, q0 -; CHECK-NEXT: vadd.f32 q2, q6, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vadd.f32 q2, q3, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vadd.f32 q1, q3, q1 -; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.f32 q1, q5, q6 ; CHECK-NEXT: vadd.f32 q2, q3, q4 +; CHECK-NEXT: vadd.f32 q0, q2, q1 +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vadd.f32 q1, q6, q7 +; CHECK-NEXT: vadd.f32 q2, q4, q5 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.f32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #112 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x float>, <64 x float>* %src, align 4 @@ -983,25 +918,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q2, [r0] -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.f64 d10, d5 +; CHECK-NEXT: vmov.f32 s20, s10 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s21, s14 -; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s23, s2 ; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s9 ; CHECK-NEXT: vmov.f32 s21, s13 -; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s23, s1 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.f32 q0, q2, q5 ; CHECK-NEXT: vadd.f32 q0, q0, q4 @@ -1027,17 +962,17 @@ ; CHECK-LABEL: vld4_v2f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s12, s4 -; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vadd.f16 q1, q1, q2 +; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] @@ -1058,27 +993,27 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vldrh.u16 q2, [r0, #16] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s6, s2 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s5, s8 -; CHECK-NEXT: vins.f16 s5, s12 ; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s5, s8 +; CHECK-NEXT: vmovx.f16 s6, s10 +; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s13, s9 +; CHECK-NEXT: vmovx.f16 s2, s11 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s13, s16 ; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmov.f32 s17, s9 ; CHECK-NEXT: vadd.f16 q0, q0, q1 @@ -1086,7 +1021,7 @@ ; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x half>, <16 x half>* %src, align 2 @@ -1104,18 +1039,14 @@ define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q2, q2, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 2 @@ -1133,37 +1064,25 @@ define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld4_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! ; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vadd.f16 q2, q2, q3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0] ; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0] -; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7 -; CHECK-NEXT: vadd.f16 q0, q6, q7 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vadd.f16 q6, q6, q7 ; CHECK-NEXT: vadd.f16 q4, q4, q5 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f16 q4, q4, q0 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f16 q4, q4, q6 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vadd.f16 q4, q2, q3 -; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x half>, <64 x half>* %src, align 2 @@ -1185,48 +1104,48 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s19, s4 ; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vmovx.f16 s24, s2 -; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s3, s2 ; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vins.f16 s22, s3 +; CHECK-NEXT: vmovx.f16 s23, s8 ; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s3, s10 ; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s22, s0 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vins.f16 s13, s15 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vins.f16 s23, s3 ; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s20, s3 ; CHECK-NEXT: vmovx.f16 s21, s12 +; CHECK-NEXT: vmovx.f16 s3, s14 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmov.f32 s26, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmov.f32 s27, s9 ; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s21, s3 +; CHECK-NEXT: vmov.f32 s26, s1 +; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s25, s13 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vadd.f16 q4, q6, q4 ; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s25, s13 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vadd.f16 q4, q6, q4 ; CHECK-NEXT: vadd.f16 q0, q1, q5 ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -30,9 +30,9 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: vmul.f16 q2, q0, q0 -; CHECK-NEXT: vfma.f16 q2, q1, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vmul.f16 q0, q0, q0 +; CHECK-NEXT: vfma.f16 q0, q1, q1 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 @@ -159,9 +159,9 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] ; CHECK-NEXT: vld21.32 {q0, q1}, [r0]! -; CHECK-NEXT: vmul.f32 q2, q0, q0 -; CHECK-NEXT: vfma.f32 q2, q1, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, q1 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -6,125 +6,119 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #56] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q4, [r0, #32] -; CHECK-NEXT: vldrh.u16 q5, [r0, #48] +; CHECK-NEXT: vldrh.u16 q1, [r0, #32] +; CHECK-NEXT: vldrh.u16 q4, [r0, #48] ; CHECK-NEXT: vldrh.u16 q3, [r0], #64 -; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmovx.f16 s26, s4 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] +; CHECK-NEXT: vmovx.f16 s27, s16 +; CHECK-NEXT: vins.f16 s26, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vins.f16 s10, s8 +; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s6, s14 ; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] -; CHECK-NEXT: vins.f16 s2, s19 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmovx.f16 s5, s25 -; CHECK-NEXT: vins.f16 s3, s23 -; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vmovx.f16 s11, s17 ; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s8, s23 -; CHECK-NEXT: vmovx.f16 s7, s21 -; CHECK-NEXT: vins.f16 s0, s15 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s4, s13 -; CHECK-NEXT: vins.f16 s25, s27 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s27 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vmul.f16 q2, q1, r2 -; CHECK-NEXT: vmul.f16 q0, q0, r2 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s28, s8 -; CHECK-NEXT: vins.f16 s7, s28 -; CHECK-NEXT: vmovx.f16 s30, s16 -; CHECK-NEXT: vmovx.f16 s31, s20 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s29, s24 -; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vins.f16 s11, s8 +; CHECK-NEXT: vmovx.f16 s25, s20 +; CHECK-NEXT: vins.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s6, s22 +; CHECK-NEXT: vmovx.f16 s1, s15 +; CHECK-NEXT: vmovx.f16 s8, s13 ; CHECK-NEXT: vins.f16 s20, s22 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vins.f16 s25, s6 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s9, s21 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s1, s23 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s31, s4 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vins.f16 s24, s26 -; CHECK-NEXT: vmov.f32 s15, s20 -; CHECK-NEXT: vins.f16 s28, s4 -; CHECK-NEXT: vmovx.f16 s4, s26 -; CHECK-NEXT: vmov.f32 s13, s24 -; CHECK-NEXT: vins.f16 s29, s4 +; CHECK-NEXT: vins.f16 s21, s23 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s16 +; CHECK-NEXT: vins.f16 s9, s1 +; CHECK-NEXT: vmov.f32 s13, s20 +; CHECK-NEXT: vmul.f16 q6, q6, r2 ; CHECK-NEXT: vmul.f16 q3, q3, r2 -; CHECK-NEXT: vmul.f16 q7, q7, r2 +; CHECK-NEXT: vins.f16 s2, s7 +; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmul.f16 q0, q0, r2 ; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s6, s28 +; CHECK-NEXT: vmovx.f16 s6, s24 +; CHECK-NEXT: vmul.f16 q2, q2, r2 +; CHECK-NEXT: vmovx.f16 s7, s0 ; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s8, s8 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s5, s9 -; CHECK-NEXT: vins.f16 s12, s28 -; CHECK-NEXT: vins.f16 s6, s5 -; CHECK-NEXT: vmovx.f16 s18, s13 -; CHECK-NEXT: vmovx.f16 s5, s29 -; CHECK-NEXT: vins.f16 s1, s9 -; CHECK-NEXT: vins.f16 s18, s5 -; CHECK-NEXT: vmovx.f16 s23, s2 -; CHECK-NEXT: vmovx.f16 s5, s10 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: vins.f16 s23, s5 -; CHECK-NEXT: vins.f16 s13, s29 -; CHECK-NEXT: vmovx.f16 s27, s3 +; CHECK-NEXT: vmovx.f16 s5, s1 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s8, s25 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmovx.f16 s19, s2 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmovx.f16 s23, s3 ; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vins.f16 s27, s8 -; CHECK-NEXT: vins.f16 s14, s30 -; CHECK-NEXT: vmovx.f16 s26, s15 -; CHECK-NEXT: vins.f16 s15, s31 -; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vins.f16 s14, s26 +; CHECK-NEXT: vins.f16 s23, s8 +; CHECK-NEXT: vmovx.f16 s22, s15 +; CHECK-NEXT: vins.f16 s15, s27 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vins.f16 s12, s24 +; CHECK-NEXT: vins.f16 s13, s25 ; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vins.f16 s1, s9 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vins.f16 s22, s8 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vmovx.f16 s5, s30 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s17, s0 ; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s29, s0 -; CHECK-NEXT: vins.f16 s22, s5 +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s11, s31 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vmov.f32 s31, s6 -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: vmov.f32 s17, s29 -; CHECK-NEXT: vmov.f32 s20, s14 -; CHECK-NEXT: vmov.f32 s24, s15 -; CHECK-NEXT: vstrh.16 q5, [r1, #32] -; CHECK-NEXT: vstrh.16 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vstrh.16 q4, [r1, #32] +; CHECK-NEXT: vmov.f32 s20, s15 +; CHECK-NEXT: vmov.f32 s7, s5 +; CHECK-NEXT: vstrh.16 q5, [r1, #48] ; CHECK-NEXT: vstrh.16 q2, [r1], #64 -; CHECK-NEXT: vmov.f32 s19, s31 -; CHECK-NEXT: vstrh.16 q4, [r1, #-48] +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f32 s5, s25 +; CHECK-NEXT: vstrh.16 q1, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r7, pc} entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -176,8 +176,8 @@ ; CHECK-LABEL: vmovn64_b2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -185,8 +185,8 @@ ; CHECKBE-LABEL: vmovn64_b2: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vmov.f32 s4, s6 -; CHECKBE-NEXT: vmov.f32 s5, s7 ; CHECKBE-NEXT: vmov.f32 s6, s0 +; CHECKBE-NEXT: vmov.f32 s5, s7 ; CHECKBE-NEXT: vmov.f32 s7, s1 ; CHECKBE-NEXT: vmov q0, q1 ; CHECKBE-NEXT: bx lr @@ -199,16 +199,16 @@ ; CHECK-LABEL: vmovn64_b3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn64_b3: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vmov.f32 s0, s2 -; CHECKBE-NEXT: vmov.f32 s1, s3 ; CHECKBE-NEXT: vmov.f32 s2, s4 +; CHECKBE-NEXT: vmov.f32 s1, s3 ; CHECKBE-NEXT: vmov.f32 s3, s5 ; CHECKBE-NEXT: bx lr entry: @@ -301,11 +301,11 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn32_b2: @@ -326,22 +326,21 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn32_b3: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.32 q2, q1 ; CHECKBE-NEXT: vrev64.32 q1, q0 -; CHECKBE-NEXT: vmov.f32 s12, s5 -; CHECKBE-NEXT: vmov.f32 s13, s8 -; CHECKBE-NEXT: vmov.f32 s14, s7 -; CHECKBE-NEXT: vmov.f32 s15, s10 -; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: vmov.f32 s4, s5 +; CHECKBE-NEXT: vmov.f32 s6, s7 +; CHECKBE-NEXT: vmov.f32 s5, s8 +; CHECKBE-NEXT: vmov.f32 s7, s10 +; CHECKBE-NEXT: vrev64.32 q0, q1 ; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -450,15 +449,15 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s6 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s11, s7 -; CHECK-NEXT: vins.f16 s11, s3 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vins.f16 s5, s1 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b2: @@ -466,12 +465,12 @@ ; CHECKBE-NEXT: vrev64.16 q2, q0 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: vmovx.f16 s5, s1 -; CHECKBE-NEXT: vins.f16 s5, s9 ; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vins.f16 s4, s8 ; CHECKBE-NEXT: vmovx.f16 s6, s2 -; CHECKBE-NEXT: vins.f16 s6, s10 ; CHECKBE-NEXT: vmovx.f16 s7, s3 +; CHECKBE-NEXT: vins.f16 s5, s9 +; CHECKBE-NEXT: vins.f16 s4, s8 +; CHECKBE-NEXT: vins.f16 s6, s10 ; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr @@ -483,28 +482,27 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s1, s9 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 ; CHECK-NEXT: vins.f16 s1, s5 -; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s2, s10 ; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s3, s11 ; CHECK-NEXT: vins.f16 s3, s7 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b3: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: vrev64.16 q3, q0 ; CHECKBE-NEXT: vrev64.16 q2, q1 -; CHECKBE-NEXT: vmovx.f16 s5, s13 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: vmovx.f16 s5, s5 +; CHECKBE-NEXT: vmovx.f16 s4, s4 +; CHECKBE-NEXT: vmovx.f16 s6, s6 +; CHECKBE-NEXT: vmovx.f16 s7, s7 ; CHECKBE-NEXT: vins.f16 s5, s9 -; CHECKBE-NEXT: vmovx.f16 s4, s12 ; CHECKBE-NEXT: vins.f16 s4, s8 -; CHECKBE-NEXT: vmovx.f16 s6, s14 ; CHECKBE-NEXT: vins.f16 s6, s10 -; CHECKBE-NEXT: vmovx.f16 s7, s15 ; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -148,11 +148,11 @@ define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { ; CHECK-LABEL: vmovn64_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> @@ -163,11 +163,11 @@ define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { ; CHECK-LABEL: vmovn64_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> @@ -232,11 +232,11 @@ define arm_aapcs_vfpcc void @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { ; CHECK-LABEL: vmovn32_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -247,11 +247,11 @@ define arm_aapcs_vfpcc void @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { ; CHECK-LABEL: vmovn32_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -314,15 +314,15 @@ define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s6 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s11, s7 -; CHECK-NEXT: vins.f16 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vins.f16 s5, s1 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> @@ -333,15 +333,15 @@ define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s1 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmovx.f16 s11, s3 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -190,15 +190,12 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_0ext(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.s32 q2, q0, q3 -; CHECK-NEXT: vmullb.s32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -213,15 +210,12 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.s32 q2, q3, q0 -; CHECK-NEXT: vmullb.s32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -241,8 +235,8 @@ ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: umull lr, r12, r1, r0 ; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -252,10 +246,10 @@ ; CHECK-NEXT: mla r5, r3, r2, r5 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: mla r3, r3, r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r5, lr, r4, r0 ; CHECK-NEXT: umull r3, r12, r1, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 @@ -286,8 +280,8 @@ ; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: umull lr, r12, r0, r1 ; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -296,10 +290,10 @@ ; CHECK-NEXT: mla r1, r4, r1, r2 ; CHECK-NEXT: asrs r2, r3, #31 ; CHECK-NEXT: mla r2, r0, r2, r5 -; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r3, lr, r0, r5 ; CHECK-NEXT: umull r2, r12, r0, r1 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 @@ -474,15 +468,12 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_0213_0ext(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.u32 q2, q0, q3 -; CHECK-NEXT: vmullb.u32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.u32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -497,15 +488,12 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.u32 q2, q3, q0 -; CHECK-NEXT: vmullb.u32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.u32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -522,13 +510,13 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: umull r1, r12, r1, r0 ; CHECK-NEXT: umull r3, r2, r3, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: umull r1, r2, r1, r0 @@ -551,13 +539,13 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: umull r1, r12, r0, r1 ; CHECK-NEXT: umull r3, r2, r0, r3 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: umull r1, r2, r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll @@ -72,16 +72,16 @@ define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vst2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add.w r0, r1, #32 -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: bx lr entry: @@ -144,11 +144,11 @@ define <4 x double> *@vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vst2_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d5, d0 ; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f64 d4, d2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1], #32 ; CHECK-NEXT: mov r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -44,8 +44,8 @@ ; CHECK-LABEL: vst2_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.32 {q0, q1}, [r1] ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! @@ -68,10 +68,10 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #112] ; CHECK-NEXT: vldrw.u32 q3, [r0, #96] ; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] @@ -100,17 +100,17 @@ define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vst2_v4i32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s1 ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: @@ -186,8 +186,8 @@ ; CHECK-LABEL: vst2_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.16 {q0, q1}, [r1] ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]! @@ -207,34 +207,31 @@ define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vst2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmov.f32 s0, s10 -; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vmovx.f16 s6, s7 ; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmovx.f16 s14, s4 -; CHECK-NEXT: vins.f16 s3, s12 -; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vins.f16 s3, s6 +; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmovx.f16 s15, s9 +; CHECK-NEXT: vins.f16 s9, s5 ; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vins.f16 s15, s4 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.f32 s13, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmovx.f16 s19, s9 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vins.f16 s19, s4 -; CHECK-NEXT: vstrb.8 q4, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -328,12 +325,12 @@ define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vst2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vstrb.8 q2, [r1], #16 ; CHECK-NEXT: vmov.f32 s1, s7 @@ -354,25 +351,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.f64 d10, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vstrb.8 q1, [r1], #48 -; CHECK-NEXT: vmov.f32 s23, s11 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s23, s11 ; CHECK-NEXT: vstrw.32 q5, [r1, #-32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -429,8 +426,8 @@ ; CHECK-LABEL: vst2_v8f32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.32 {q0, q1}, [r1] ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! @@ -453,10 +450,10 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #112] ; CHECK-NEXT: vldrw.u32 q3, [r0, #96] ; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] @@ -485,17 +482,17 @@ define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vst2_v4f32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s1 ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: @@ -543,19 +540,19 @@ ; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vins.f16 s1, s5 -; CHECK-NEXT: vins.f16 s10, s4 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vstrh.16 q1, [r1] +; CHECK-NEXT: vmovx.f16 s6, s5 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s4 +; CHECK-NEXT: vstrh.16 q2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -589,8 +586,8 @@ ; CHECK-LABEL: vst2_v16f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst20.16 {q2, q3}, [r1] ; CHECK-NEXT: vst21.16 {q2, q3}, [r1]! @@ -610,32 +607,32 @@ define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s1, s0 ; CHECK-NEXT: vmovx.f16 s3, s7 -; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vins.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vins.f16 s3, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s14, s8 ; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmovx.f16 s14, s5 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vmovx.f16 s8, s9 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s14, s8 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmov.f32 s10, s5 ; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vmov.f32 s13, s10 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vstrb.8 q2, [r1] +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s8 +; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -652,8 +649,8 @@ define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vst2_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d4, d3 ; CHECK-NEXT: vmov.f64 d5, d1 ; CHECK-NEXT: vmov.f64 d3, d0 @@ -675,17 +672,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vmov.f64 d8, d4 ; CHECK-NEXT: vmov.f64 d9, d0 ; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f64 d4, d6 ; CHECK-NEXT: vmov.f64 d2, d7 ; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vstrw.32 q1, [r1, #48] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -12,16 +12,15 @@ ; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov.32 q0[0], r4 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q0[0], r4 ; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} @@ -44,22 +43,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s19, s13 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov.f32 s1, s15 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov.f32 s16, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s2, s7 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -84,41 +83,41 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vldrw.u32 q7, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s28 -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmov.f32 s9, s27 -; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s20, s28 +; CHECK-NEXT: vmov.f32 s9, s19 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmov.f32 s23, s29 +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s21, s24 +; CHECK-NEXT: vmov.f32 s29, s12 +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s30, s0 ; CHECK-NEXT: vmov.f32 s0, s13 ; CHECK-NEXT: vstrw.32 q7, [r1, #48] ; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vstrw.32 q2, [r1, #80] -; CHECK-NEXT: vmov.f32 s12, s25 ; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s12, s25 ; CHECK-NEXT: vmov.f32 s15, s26 -; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s6, s27 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -142,108 +141,106 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] +; CHECK-NEXT: vldrw.u32 q3, [r0, #160] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vldrw.u32 q3, [r0, #160] -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vmov.f32 s17, s9 ; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vldrw.u32 q5, [r0, #144] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r0, #144] +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s17, s27 ; CHECK-NEXT: vmov.f32 s19, s11 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f64 d8, d3 -; CHECK-NEXT: vmov.f32 s17, s31 +; CHECK-NEXT: vmov.f32 s16, s6 ; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d8, d12 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s18, s31 +; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vmov q2, q7 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f64 d4, d14 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f64 d14, d4 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s14 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s16, s24 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d0, d14 +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmov.f32 s7, s30 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s31, s1 +; CHECK-NEXT: vmov.f64 d0, d10 ; CHECK-NEXT: vmov.f32 s16, s5 -; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vmov.f32 s19, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s20 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q2, [r1, #128] -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s29, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s30, s4 +; CHECK-NEXT: vmov.f32 s27, s3 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f64 d10, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vstrw.32 q4, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: vstrw.32 q3, [r1, #128] +; CHECK-NEXT: vmov.f32 s26, s11 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f32 s23, s10 +; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -303,23 +300,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, r5, d2 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov lr, r4, d1 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov.32 q1[2], r4 ; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vstrh.32 q1, [r1, #16] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov.16 q0[3], r5 -; CHECK-NEXT: vstrh.32 q3, [r1, #16] ; CHECK-NEXT: vmov.16 q0[4], r3 ; CHECK-NEXT: vmov.16 q0[5], r4 ; CHECK-NEXT: vmov.16 q0[6], r12 @@ -343,64 +340,52 @@ define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { ; CHECK-LABEL: vst3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f32 s0, s8 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmov.f32 s12, s9 ; CHECK-NEXT: vins.f16 s12, s5 ; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s14 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vmovx.f16 s8, s8 ; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s1, s12 ; CHECK-NEXT: vins.f16 s17, s7 -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vins.f16 s16, s8 ; CHECK-NEXT: vmovx.f16 s19, s7 -; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmovx.f16 s8, s15 ; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmovx.f16 s24, s22 -; CHECK-NEXT: vins.f16 s18, s24 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s22, s18 -; CHECK-NEXT: vmov.f32 s17, s21 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vins.f16 s12, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vins.f16 s14, s20 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s17, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vins.f16 s8, s12 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vins.f16 s14, s12 +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmovx.f16 s12, s13 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s15, s14 -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vins.f16 s14, s8 -; CHECK-NEXT: vmov.f32 s6, s14 -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -421,135 +406,112 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vins.f16 s9, s23 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmovx.f16 s11, s23 -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s11, s0 -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s1, s11 -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vmov.f32 s12, s4 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov.f32 s15, s7 +; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s28, s4 -; CHECK-NEXT: vins.f16 s8, s24 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vins.f16 s11, s25 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vins.f16 s20, s4 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vmovx.f16 s4, s24 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vins.f16 s23, s5 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s5, s24 -; CHECK-NEXT: vmov.f32 s6, s24 -; CHECK-NEXT: vins.f16 s5, s28 -; CHECK-NEXT: vmovx.f16 s28, s6 -; CHECK-NEXT: vins.f16 s10, s28 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f64 d14, d2 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s0, s21 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s28, s0 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vmovx.f16 s4, s24 +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vins.f16 s28, s12 +; CHECK-NEXT: vins.f16 s22, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.f32 s31, s0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmovx.f16 s4, s26 -; CHECK-NEXT: vmov.f32 s2, s30 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vins.f16 s4, s13 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vmov.f32 s31, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s29, s8 +; CHECK-NEXT: vins.f16 s29, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s30, s0 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s27 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.f32 s12, s25 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmovx.f16 s3, s7 -; CHECK-NEXT: vmovx.f16 s4, s27 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmov.f32 s5, s23 -; CHECK-NEXT: vmov.f32 s2, s27 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vins.f16 s9, s6 +; CHECK-NEXT: vmovx.f16 s6, s10 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vmov.f32 s10, s18 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov.f32 s24, s25 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vins.f16 s12, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s14, s20 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s15, s14 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vstr s16, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vrev32.16 q5, q4 -; CHECK-NEXT: vldr s16, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s21, s16 -; CHECK-NEXT: vmovx.f16 s16, s22 -; CHECK-NEXT: vins.f16 s14, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmovx.f16 s4, s17 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vins.f16 s24, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vins.f16 s26, s4 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vmov.f32 s27, s26 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmovx.f16 s8, s25 +; CHECK-NEXT: vrev32.16 q6, q4 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s25, s8 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q4, q4 -; CHECK-NEXT: vins.f16 s17, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vins.f16 s26, s4 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vins.f16 s10, s8 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vmov.f32 s13, s25 ; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -664,29 +626,26 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-LABEL: vst3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrb.u16 q2, [r0, #8] -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: vldrb.u16 q3, [r0] ; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s0, s10 ; CHECK-NEXT: vmovx.f16 s3, s11 -; CHECK-NEXT: vins.f16 s3, s12 -; CHECK-NEXT: vldrb.u16 q3, [r0] -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmovx.f16 s20, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: vmovx.f16 s1, s1 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vins.f16 s3, s2 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmovx.f16 s1, s15 ; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vins.f16 s2, s1 ; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s2, s18 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vstrb.16 q0, [r1, #16] @@ -720,7 +679,7 @@ ; CHECK-NEXT: vmov.u16 r0, q3[5] ; CHECK-NEXT: vmov.8 q4[15], r0 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 @@ -957,11 +916,9 @@ define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { ; CHECK-LABEL: vst3_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vmov.f32 s14, s2 ; CHECK-NEXT: vmov.f32 s15, s3 ; CHECK-NEXT: vmov.f32 s2, s6 @@ -969,8 +926,10 @@ ; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vmov.f32 s7, s9 ; CHECK-NEXT: vstrb.8 q1, [r1], #32 -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vstrw.32 q0, [r1, #-16] +; CHECK-NEXT: vstrw.32 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 @@ -991,41 +950,37 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d10, d2 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vmov.f64 d7, d15 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s19, s3 +; CHECK-NEXT: vmov.f32 s20, s4 +; CHECK-NEXT: vstrw.32 q4, [r1, #80] ; CHECK-NEXT: vmov.f32 s21, s5 ; CHECK-NEXT: vmov.f32 s22, s28 ; CHECK-NEXT: vmov.f32 s23, s29 -; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vmov.f32 s4, s8 ; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.f32 s28, s24 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vmov.f32 s29, s25 -; CHECK-NEXT: vmov.f64 d8, d7 ; CHECK-NEXT: vmov.f32 s30, s12 -; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vmov.f32 s31, s13 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmov.f32 s4, s8 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vstrw.32 q4, [r1, #80] -; CHECK-NEXT: vmov.f32 s5, s9 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q7, [r1, #48] ; CHECK-NEXT: vmov.f32 s3, s27 -; CHECK-NEXT: vmov.f32 s9, s15 +; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s9, s15 ; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1047,10 +1002,10 @@ define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) { ; CHECK-LABEL: vst3_v2f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r2, [r0, #20] ; CHECK-NEXT: vldr s0, [r0] ; CHECK-NEXT: vldr s3, [r0, #4] ; CHECK-NEXT: vldr s1, [r0, #8] -; CHECK-NEXT: ldr r2, [r0, #20] ; CHECK-NEXT: vldr s2, [r0, #16] ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: strd r0, r2, [r1, #16] @@ -1075,22 +1030,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s0 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s19, s13 ; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s5, s15 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s16, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -1115,41 +1070,41 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s24 -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.f32 s25, s28 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: vmov.f32 s24, s16 +; CHECK-NEXT: vmov.f32 s27, s17 +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s21, s28 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vmov.f32 s0, s9 ; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vmov.f32 s3, s30 -; CHECK-NEXT: vmov.f32 s14, s31 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s8, s29 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s8, s29 ; CHECK-NEXT: vmov.f32 s11, s30 -; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vmov.f32 s6, s31 +; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s6, s31 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1173,107 +1128,106 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] ; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #160] +; CHECK-NEXT: vstrw.32 q5, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vstrw.32 q7, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #160] -; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] ; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s27 ; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.f64 d6, d3 ; CHECK-NEXT: vmov.f32 s13, s23 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s15, s7 ; CHECK-NEXT: vmov.f32 s14, s31 -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d6, d12 +; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s15, s25 ; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov q2, q7 -; CHECK-NEXT: vmov.f64 d0, d10 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f64 d4, d14 +; CHECK-NEXT: vmov.f32 s0, s20 ; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vmov.f64 d10, d2 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f64 d14, d2 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s23, s30 +; CHECK-NEXT: vmov.f32 s12, s24 +; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s15, s25 +; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d12, d9 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vstrw.32 q5, [r1, #112] -; CHECK-NEXT: vmov.f32 s27, s19 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d0, d14 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q2, [r1, #128] -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vmov.f64 d0, d8 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s23, s10 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.f32 s29, s8 +; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s30, s8 +; CHECK-NEXT: vmov.f32 s27, s3 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f64 d8, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vstrw.32 q5, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vstrw.32 q3, [r1, #128] +; CHECK-NEXT: vmov.f32 s26, s7 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1297,14 +1251,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldmia r0, {s0, s1} ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s2, s10 ; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s2, s6 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: stm r1!, {r0, r2, r3} @@ -1328,8 +1282,6 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: ldrd r2, r12, [r0] ; CHECK-NEXT: ldrd r3, lr, [r0, #8] ; CHECK-NEXT: vmov.32 q0[0], r2 @@ -1337,30 +1289,29 @@ ; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: vmov.32 q1[1], lr -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmovx.f16 s10, s0 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vins.f16 s8, s5 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmovx.f16 s13, s3 +; CHECK-NEXT: vmovx.f16 s6, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s10, s4 ; CHECK-NEXT: vmovx.f16 s2, s2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s4, s10 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s5, s10 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vins.f16 s13, s10 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s8 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, r2, d8 +; CHECK-NEXT: vmov r0, r2, d6 ; CHECK-NEXT: strd r0, r2, [r1, #16] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -1379,65 +1330,53 @@ define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) { ; CHECK-LABEL: vst3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vins.f16 s0, s20 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vins.f16 s4, s21 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s15, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s12, s22 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vins.f16 s12, s24 -; CHECK-NEXT: vmov.f32 s25, s11 -; CHECK-NEXT: vins.f16 s13, s23 -; CHECK-NEXT: vmov.f32 s26, s11 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmovx.f16 s28, s13 -; CHECK-NEXT: vins.f16 s25, s28 -; CHECK-NEXT: vmovx.f16 s28, s26 -; CHECK-NEXT: vins.f16 s14, s28 -; CHECK-NEXT: vmovx.f16 s28, s9 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vrev32.16 q5, q5 -; CHECK-NEXT: vins.f16 s4, s28 -; CHECK-NEXT: vmovx.f16 s28, s10 -; CHECK-NEXT: vins.f16 s6, s28 -; CHECK-NEXT: vmov.f32 s26, s14 -; CHECK-NEXT: vmov.f32 s7, s6 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s21, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.f32 s5, s21 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vmovx.f16 s11, s15 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vins.f16 s8, s13 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vins.f16 s11, s4 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vins.f16 s9, s15 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vrev32.16 q3, q3 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmovx.f16 s4, s14 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s14, s6 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -1458,150 +1397,121 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #128 -; CHECK-NEXT: sub sp, #128 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmovx.f16 s7, s15 +; CHECK-NEXT: .pad #96 +; CHECK-NEXT: sub sp, #96 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s7, s11 ; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vins.f16 s5, s15 -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vmov.f32 s6, s27 -; CHECK-NEXT: vins.f16 s28, s12 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d2, d10 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s0, s21 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vins.f16 s0, s9 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vmov.f32 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.f32 s5, s20 -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.f32 s9, s20 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vins.f16 s9, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmov.f32 s0, s25 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmovx.f16 s2, s20 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s4, s16 +; CHECK-NEXT: vins.f16 s4, s20 +; CHECK-NEXT: vmov.f32 s0, s17 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmovx.f16 s4, s28 +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vins.f16 s0, s21 +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s0 +; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmovx.f16 s8, s24 +; CHECK-NEXT: vmov.f32 s22, s28 +; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s17, s28 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vins.f16 s17, s4 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s21, s12 +; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vins.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vmovx.f16 s27, s11 +; CHECK-NEXT: vins.f16 s4, s25 +; CHECK-NEXT: vins.f16 s27, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s25, s11 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s26, s31 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vins.f16 s25, s4 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s30, s4 +; CHECK-NEXT: vmov.f32 s6, s18 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov.f32 s3, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s30 +; CHECK-NEXT: vins.f16 s14, s4 +; CHECK-NEXT: vmov.f32 s10, s30 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vmovx.f16 s4, s13 ; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s31, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vrev32.16 q3, q3 +; CHECK-NEXT: vmov.f32 s6, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s13, s4 ; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vmovx.f16 s7, s15 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vins.f16 s5, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vmov.f32 s2, s15 -; CHECK-NEXT: vins.f16 s1, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s6, s16 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmov.f32 s20, s21 -; CHECK-NEXT: vins.f16 s20, s16 -; CHECK-NEXT: vmovx.f16 s16, s14 -; CHECK-NEXT: vins.f16 s22, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s22 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s30 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmovx.f16 s12, s21 -; CHECK-NEXT: vstr s12, [sp, #64] @ 4-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s1 -; CHECK-NEXT: vrev32.16 q4, q3 -; CHECK-NEXT: vldr s12, [sp, #64] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s12, s25 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vins.f16 s8, s12 -; CHECK-NEXT: vmovx.f16 s0, s26 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vstrw.32 q1, [r1, #80] -; CHECK-NEXT: vmov.f32 s10, s26 -; CHECK-NEXT: vrev32.16 q6, q0 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s25, s12 -; CHECK-NEXT: vmovx.f16 s12, s26 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s10 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vstrw.32 q7, [r1] -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vmov.f32 s9, s25 -; CHECK-NEXT: vmov.f32 s22, s18 -; CHECK-NEXT: vmov.f32 s10, s26 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s4, s28 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: add sp, #128 +; CHECK-NEXT: vmov.f32 s7, s31 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: add sp, #96 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1623,8 +1533,8 @@ define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) { ; CHECK-LABEL: vst3_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov.f64 d6, d2 ; CHECK-NEXT: vmov.f64 d7, d1 @@ -1653,32 +1563,28 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q7, [r0, #48] ; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f64 d10, d2 +; CHECK-NEXT: vmov.f64 d15, d13 ; CHECK-NEXT: vmov.f64 d7, d1 -; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vmov.f64 d10, d2 ; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f64 d12, d4 +; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vmov.f64 d2, d8 ; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f64 d1, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d2, d8 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmov.f64 d8, d15 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f64 d12, d4 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] ; CHECK-NEXT: vmov.f64 d13, d14 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d5 ; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll @@ -104,21 +104,21 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d8 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s8, s16 ; CHECK-NEXT: vmov.f32 s9, s17 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s13 ; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s13 ; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] ; CHECK-NEXT: vmov.f32 s5, s15 @@ -215,16 +215,16 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.f64 d2, d6 ; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d0, d7 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vmov.f64 d6, d8 ; CHECK-NEXT: vmov.f64 d4, d9 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -13,20 +13,20 @@ ; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldm r6, {r4, r5, r6} ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 ; CHECK-NEXT: ldr r0, [r0, #28] +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 ; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov.f32 s6, s1 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 @@ -78,12 +78,12 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] @@ -120,55 +120,50 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #192 ; CHECK-NEXT: sub sp, #192 -; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q7, [r0, #240] -; CHECK-NEXT: vldrw.u32 q3, [r0, #208] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q4, [r0, #240] +; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q3, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] @@ -210,28 +205,28 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vmov.f32 s24, s19 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f32 s14, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s20, s16 ; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -385,12 +380,12 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] @@ -421,61 +416,61 @@ define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) { ; CHECK-LABEL: vst4_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vins.f16 s12, s0 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov.f32 s3, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmovx.f16 s27, s4 ; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f32 s5, s4 +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s0, s17 ; CHECK-NEXT: vmovx.f16 s2, s13 +; CHECK-NEXT: vins.f16 s27, s8 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s8, s16 ; CHECK-NEXT: vins.f16 s13, s17 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vins.f16 s27, s20 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vmovx.f16 s20, s16 ; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s28, s20 ; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov.f32 s25, s4 -; CHECK-NEXT: vmov.f32 s22, s28 -; CHECK-NEXT: vmovx.f16 s28, s11 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vmov.f32 s23, s27 ; CHECK-NEXT: vmovx.f16 s27, s7 -; CHECK-NEXT: vins.f16 s27, s28 ; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vmovx.f16 s28, s19 +; CHECK-NEXT: vins.f16 s27, s4 ; CHECK-NEXT: vmovx.f16 s26, s15 -; CHECK-NEXT: vins.f16 s15, s19 -; CHECK-NEXT: vins.f16 s26, s28 -; CHECK-NEXT: vmovx.f16 s31, s6 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmov.f32 s25, s7 +; CHECK-NEXT: vins.f16 s26, s4 +; CHECK-NEXT: vmovx.f16 s7, s6 +; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vins.f16 s6, s10 -; CHECK-NEXT: vins.f16 s31, s8 -; CHECK-NEXT: vmov.f32 s29, s6 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vins.f16 s15, s19 +; CHECK-NEXT: vins.f16 s7, s4 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s14 ; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vmovx.f16 s30, s14 -; CHECK-NEXT: vmov.f32 s24, s15 ; CHECK-NEXT: vins.f16 s14, s18 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmov.f32 s28, s14 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vmov.f32 s24, s15 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] -; CHECK-NEXT: vstrb.8 q7, [r1, #32] +; CHECK-NEXT: vstrb.8 q1, [r1, #32] ; CHECK-NEXT: vstrb.8 q0, [r1, #16] ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -723,25 +718,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vmov.f32 s13, s17 -; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov.f32 s22, s4 ; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s22, s4 ; CHECK-NEXT: vmov.f32 s23, s5 +; CHECK-NEXT: vmov.f32 s12, s16 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s21, s9 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -766,57 +761,56 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d12, d14 +; CHECK-NEXT: vldrw.u32 q7, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s25, s29 -; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s27, s1 -; CHECK-NEXT: vmov.f32 s0, s30 -; CHECK-NEXT: vstrw.32 q6, [r1] -; CHECK-NEXT: vmov.f32 s1, s31 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vmov.f64 d14, d0 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov.f64 d13, d1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.f32 s4, s28 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s5, s29 +; CHECK-NEXT: vmov.f32 s24, s30 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s25, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s4, s12 +; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vmov.f32 s9, s15 -; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d1, d15 ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d13, d7 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s21 ; CHECK-NEXT: vmov.f32 s30, s16 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] ; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s16, s2 ; CHECK-NEXT: vstrw.32 q7, [r1, #64] ; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s21, s3 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s26 +; CHECK-NEXT: vstrw.32 q4, [r1, #96] +; CHECK-NEXT: vmov.f32 s21, s27 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vstrw.32 q5, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -901,12 +895,12 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] @@ -943,55 +937,50 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #192 ; CHECK-NEXT: sub sp, #192 -; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q7, [r0, #240] -; CHECK-NEXT: vldrw.u32 q3, [r0, #208] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q4, [r0, #240] +; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q3, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] @@ -1033,28 +1022,28 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vmov.f32 s24, s19 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f32 s14, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s20, s16 ; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -1079,17 +1068,18 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s0, s1} +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s5, [r0, #4] ; CHECK-NEXT: vldr s4, [r0, #8] ; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vldr s5, [r0, #12] -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vldr s1, [r0, #12] +; CHECK-NEXT: vmovx.f16 s6, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s1 +; CHECK-NEXT: vins.f16 s0, s5 +; CHECK-NEXT: vins.f16 s3, s6 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr @@ -1122,37 +1112,33 @@ ; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: ldrd r3, r0, [r0, #8] -; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov.32 q1[1], r12 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s9 -; CHECK-NEXT: vmovx.f16 s14, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vins.f16 s14, s8 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s6, s2 ; CHECK-NEXT: vmovx.f16 s11, s1 -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s13, s3 -; CHECK-NEXT: vins.f16 s11, s13 +; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vins.f16 s10, s13 +; CHECK-NEXT: vins.f16 s11, s2 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s5, s0 -; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vmov.f32 s7, s12 +; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r7, pc} entry: @@ -1205,12 +1191,12 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] @@ -1241,70 +1227,61 @@ define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s29 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmovx.f16 s12, s25 -; CHECK-NEXT: vstr s2, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s9 -; CHECK-NEXT: vins.f16 s5, s29 -; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s12, s25 ; CHECK-NEXT: vmovx.f16 s19, s4 -; CHECK-NEXT: vmovx.f16 s12, s28 -; CHECK-NEXT: vins.f16 s9, s25 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s12, s20 ; CHECK-NEXT: vins.f16 s19, s12 -; CHECK-NEXT: vmovx.f16 s14, s8 -; CHECK-NEXT: vmovx.f16 s12, s24 -; CHECK-NEXT: vins.f16 s14, s12 -; CHECK-NEXT: vins.f16 s4, s28 -; CHECK-NEXT: vstr s14, [sp] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s14, s24 ; CHECK-NEXT: vmovx.f16 s15, s7 -; CHECK-NEXT: vmovx.f16 s20, s31 -; CHECK-NEXT: vins.f16 s8, s24 -; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmovx.f16 s20, s27 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s23 +; CHECK-NEXT: vins.f16 s15, s14 ; CHECK-NEXT: vmovx.f16 s14, s11 -; CHECK-NEXT: vins.f16 s7, s31 -; CHECK-NEXT: vins.f16 s14, s20 +; CHECK-NEXT: vmovx.f16 s1, s27 +; CHECK-NEXT: vins.f16 s7, s23 +; CHECK-NEXT: vins.f16 s14, s1 ; CHECK-NEXT: vmovx.f16 s23, s6 -; CHECK-NEXT: vmovx.f16 s28, s30 -; CHECK-NEXT: vins.f16 s6, s30 -; CHECK-NEXT: vins.f16 s23, s28 -; CHECK-NEXT: vins.f16 s11, s27 +; CHECK-NEXT: vmovx.f16 s1, s22 +; CHECK-NEXT: vins.f16 s6, s22 +; CHECK-NEXT: vins.f16 s5, s21 +; CHECK-NEXT: vins.f16 s4, s20 +; CHECK-NEXT: vins.f16 s23, s1 ; CHECK-NEXT: vmovx.f16 s22, s10 -; CHECK-NEXT: vmovx.f16 s24, s26 -; CHECK-NEXT: vldr s28, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s22, s24 ; CHECK-NEXT: vins.f16 s10, s26 +; CHECK-NEXT: vmovx.f16 s1, s26 +; CHECK-NEXT: vins.f16 s9, s25 +; CHECK-NEXT: vins.f16 s8, s24 +; CHECK-NEXT: vins.f16 s11, s27 ; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vmov.f32 s27, s28 -; CHECK-NEXT: vldr s28, [sp] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s21, s6 -; CHECK-NEXT: vmov.f32 s12, s11 -; CHECK-NEXT: vmov.f32 s20, s10 -; CHECK-NEXT: vstrb.8 q3, [r1, #48] -; CHECK-NEXT: vmov.f32 s3, s27 ; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: vstrb.8 q5, [r1, #32] -; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vmov.f32 s3, s0 +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vmov.f32 s26, s12 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s25, s4 ; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vmov.f32 s13, s7 ; CHECK-NEXT: vstrb.8 q6, [r1] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.f32 s12, s11 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vstrb.8 q3, [r1, #48] +; CHECK-NEXT: vmov.f32 s20, s10 +; CHECK-NEXT: vstrb.8 q5, [r1, #32] +; CHECK-NEXT: vpop {d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -1329,15 +1306,15 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d8, d4 ; CHECK-NEXT: vmov.f64 d11, d2 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d10, d6 ; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f64 d2, d7 @@ -1369,32 +1346,32 @@ ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d15, d10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #112] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d15, d10 +; CHECK-NEXT: vmov.f64 d14, d12 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d14, d4 ; CHECK-NEXT: vmov.f64 d15, d2 -; CHECK-NEXT: vmov.f64 d2, d5 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d4, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d10, d13 -; CHECK-NEXT: vmov.f64 d12, d0 +; CHECK-NEXT: vmov.f64 d2, d5 ; CHECK-NEXT: vstrw.32 q5, [r1, #32] ; CHECK-NEXT: vmov.f64 d5, d6 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] ; CHECK-NEXT: vmov.f64 d13, d8 ; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vmov.f64 d12, d0 ; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q6, [r1, #80] diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -56,14 +56,14 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d0, d8 ; CHECK-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEXT: vmov.f32 s0, s16 ; CHECK-NEXT: vmov.f32 s2, s17 ; CHECK-NEXT: vand q6, q0, q5 ; CHECK-NEXT: vmov r0, r1, d13 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov.f64 d0, d9 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: vand q5, q0, q5