Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1833,6 +1833,13 @@
 def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
 def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>;
 
+// Same node as insertelt (ISD::INSERT_VECTOR_ELT), but the inserted scalar
+// (operand 2) is typed i32, as will be used when i32 is the only legal type.
+def ARMVecInsert : SDTypeProfile<1, 3, [
+  SDTCisVT<2, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
+]>;
+def ARMinsertelt : SDNode<"ISD::INSERT_VECTOR_ELT", ARMVecInsert>;
+
 let Predicates = [HasMVEInt] in {
   def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
             (f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
@@ -1893,6 +1900,58 @@
             (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
   def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
             (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+
+  foreach LANE = [0, 2, 4, 6] in {
+    defvar SSUB = !cast<SubRegIndex>("ssub_"#!srl(LANE, 1)); // 32-bit subreg holding lanes LANE and LANE+1
+
+    // v8f16: both inserted elements come from even lanes; pair them with one VINS into SSUB
+    def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV),
+                                    (extractelt (v8f16 MQPR:$src1), imm_even:$lane1),
+                                    LANE),
+                         (extractelt (v8f16 MQPR:$src2), imm_even:$lane2),
+                         !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_even:$lane1)),
+                                         (EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$lane2))),
+                                  SSUB), MQPR)>;
+
+    // v8f16: first element comes from an odd lane (fetched with VMOVH, i.e. VMOVX), second from an even lane via VINS
+    def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV),
+                                    (extractelt (v8f16 MQPR:$src1), imm_odd:$lane1),
+                                    LANE),
+                         (extractelt (v8f16 MQPR:$src2), imm_even:$lane2),
+                         !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (VMOVH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane1))),
+                                         (EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$lane2))),
+                                  SSUB), MQPR)>;
+
+    // v8i16: same as the even/even v8f16 case, matching ARMinsertelt of ARMvgetlaneu (i32-typed scalars)
+    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
+                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_even:$lane1),
+                                          LANE),
+                            (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
+                            !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                                                         (SSubReg_f16_reg imm_even:$lane1)),
+                                         (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
+                                                         (SSubReg_f16_reg imm_even:$lane2))),
+                                  SSUB), MQPR)>;
+
+    // v8i16: same as the odd/even v8f16 case (VMOVH, i.e. VMOVX, then VINS), matching ARMinsertelt of ARMvgetlaneu
+    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
+                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_odd:$lane1),
+                                          LANE),
+                            (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
+                            !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (VMOVH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                                                                (SSubReg_f16_reg imm_odd:$lane1))),
+                                         (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
+                                                         (SSubReg_f16_reg imm_even:$lane2))),
+                                  SSUB), MQPR)>;
+  }
 }
 
 // end of mve_bit instructions
Index: llvm/lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrVFP.td
+++ llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -1126,9 +1126,12 @@
                   Requires<[HasFullFP16]>;
 
 def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm),
                   IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
-                  Requires<[HasFullFP16]>;
+                  Requires<[HasFullFP16]> {
+  let Constraints = "$Sd = $Sda";
+}
+
 } // PostEncoderMethod
 } // hasSideEffects
 
Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -62,23 +62,15 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle1_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vins.f16 s4, s3 +; CHECK-NEXT: vmovx.f16 s5, s2 +; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s7, s0 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -97,20 +89,16 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmovx.f16 s5, s3 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vins.f16 s5, s3 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; 
CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -345,27 +333,15 @@ define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle1_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> @@ -384,21 +360,16 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s5, s3 +; CHECK-NEXT: vins.f16 s5, s3 +; CHECK-NEXT: vins.f16 s1, s0 ; CHECK-NEXT: vmov.16 q1[4], r0 ; CHECK-NEXT: vmov r0, s8 ; 
CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-shufflemov.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -35,23 +35,15 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_76543210: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vins.f16 s4, s3 +; CHECK-NEXT: vmovx.f16 s5, s2 +; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s7, s0 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -346,27 +338,15 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_76543210: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[2], r0 -; 
CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-vld2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -205,41 +205,33 @@ define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vmov.f64 d2, d4 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 
r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vins.f16 s4, s9 ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmov.f32 s5, s10 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vins.f16 s5, s11 ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vins.f16 s6, s1 ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vins.f16 s7, s3 ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q3, q0 +; CHECK-NEXT: vadd.i16 q0, q1, q3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -558,16 +550,13 @@ ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] @@ -585,27 +574,22 @@ ; CHECK-LABEL: vld2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.16 q1[0], r2 ; CHECK-NEXT: vmov.16 q1[1], r0 -; 
CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.16 q1[2], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vadd.f16 q0, q1, q2 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: strd r0, r2, [r1] @@ -662,49 +646,41 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8} ; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vldrb.u8 q3, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d0, d6 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vmovx.f16 s16, s14 +; CHECK-NEXT: vmovx.f16 s12, s15 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s13 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.16 q2[1], r2 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: 
vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vins.f16 s0, s13 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.f32 s1, s14 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vins.f16 s1, s15 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vins.f16 s2, s5 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vld3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -330,73 +330,67 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] +; 
CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmovnb.i32 q3, q4 +; CHECK-NEXT: vmov.f32 s22, s0 +; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovnb.i32 q3, q4 ; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 ; CHECK-NEXT: vmov r0, s23 ; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q2[6] ; CHECK-NEXT: vmov.16 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: vmov.16 q5[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.f32 s19, s23 +; 
CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s21, s11 +; CHECK-NEXT: vins.f16 s21, s5 ; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q1[5], r0 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmovx.f16 s7, s1 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovnb.i32 q0, q5 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vadd.i16 q0, q4, q1 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vadd.i16 q0, q4, q0 ; CHECK-NEXT: vadd.i16 q0, q0, q3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -417,144 +411,132 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q3[0] +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; 
CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.u16 r2, q4[2] ; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.u16 r2, q4[5] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.u16 r2, q4[4] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u16 r2, q3[2] +; CHECK-NEXT: vmov.f32 s7, s23 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q3[5] +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s26, s16 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmovnb.i32 q7, q5 +; CHECK-NEXT: vmov r12, s27 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r12 +; CHECK-NEXT: vmov.u16 r2, q4[0] +; CHECK-NEXT: vmov.16 q5[5], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmovx.f16 s23, s17 +; CHECK-NEXT: vins.f16 s23, s19 +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vins.f16 s16, s14 +; CHECK-NEXT: vmovx.f16 s17, s15 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vmovnb.i32 q2, q4 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmovnb.i32 q3, q4 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov r2, s23 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vmov q2[3], 
q2[1], r3, r2 +; CHECK-NEXT: vadd.i16 q1, q1, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.u16 r0, q5[4] +; CHECK-NEXT: vadd.i16 q1, q1, q6 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.16 q3[1], r2 ; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r0, q5[7] +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q6[7], r0 ; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vmovnb.i32 q4, q3 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov r3, s27 ; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.u16 r2, q2[6] ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q4[5], r2 ; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.16 q4[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.16 q7[6], r2 +; CHECK-NEXT: vmov.u16 r2, q5[5] +; CHECK-NEXT: vmov.16 q7[7], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r2 
-; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vadd.i16 q0, q4, q1 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.f32 s7, s23 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmov.f32 s30, s16 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovnb.i32 q6, q5 -; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r0 -; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 +; CHECK-NEXT: vmov.u16 r0, q5[0] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; 
CHECK-NEXT: vmovx.f16 s15, s21 +; CHECK-NEXT: vmov.f32 s19, s31 +; CHECK-NEXT: vins.f16 s15, s23 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmovx.f16 s21, s11 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmovnb.i32 q0, q5 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vmovnb.i32 q2, q7 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q6 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vadd.i16 q0, q4, q0 +; CHECK-NEXT: vadd.i16 q0, q0, q6 +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -653,56 +635,54 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { ; CHECK-LABEL: vld3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrb.u16 q1, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: 
vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r0, #16] +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q1[9] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.u8 r0, q1[15] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmovx.f16 s11, s1 ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vins.f16 s11, s3 ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vadd.i16 q2, q2, q3 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vadd.i16 q2, q3, q2 ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: 
vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.u8 r0, q1[8] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vadd.i16 q0, q2, q3 ; CHECK-NEXT: vstrb.16 q0, [r1] @@ -1228,18 +1208,15 @@ ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmov.16 q1[1], r0 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vadd.f16 q1, q2, q1 +; CHECK-NEXT: vadd.f16 q1, q1, q2 ; CHECK-NEXT: vmovx.f16 s8, s2 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q0[0], r2 @@ -1262,48 +1239,39 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: 
vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: ldrd r2, r0, [r0, #16] ; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s16, s4 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmovx.f16 s13, s3 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vins.f16 s13, s5 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vadd.f16 q2, q3, q2 +; CHECK-NEXT: vadd.f16 q2, q2, q3 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vadd.f16 q0, q2, q0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 @@ -1321,8 +1289,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] @@ -1344,37 +1312,25 @@ ; CHECK-NEXT: vmov.16 q3[6], r3 ; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s24, s11 ; CHECK-NEXT: vmov r5, s8 ; 
CHECK-NEXT: vmov.f32 s14, s16 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q5[4], r2 ; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov lr, s23 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s21, s11 +; CHECK-NEXT: vins.f16 s21, s5 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov lr, s21 +; CHECK-NEXT: vmovx.f16 s23, s17 +; CHECK-NEXT: vins.f16 s23, s19 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmovx.f16 s22, s6 +; CHECK-NEXT: vins.f16 s22, s16 +; CHECK-NEXT: vmov r2, s22 ; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov r0, s21 ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vmov.16 q5[0], r5 ; CHECK-NEXT: vmov.16 q5[1], r4 @@ -1396,15 +1352,15 @@ ; CHECK-NEXT: vmov.16 q2[7], r5 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 ; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov q0[3], q0[1], r0, lr +; CHECK-NEXT: vmov q0[3], q0[1], lr, r0 ; CHECK-NEXT: vmov.f32 s23, s11 ; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 ; CHECK-NEXT: vadd.f16 q0, q5, q0 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 @@ -1422,171 +1378,147 
@@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmovx.f16 s20, s13 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r12, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s28, s16 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov.16 q5[4], r2 ; CHECK-NEXT: vmov r2, s9 
; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmovx.f16 s28, s19 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s18 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q2[6], r3 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov lr, s30 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s12, s15 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.16 q2[1], r4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmovx.f16 s21, s19 +; CHECK-NEXT: vins.f16 s21, s13 +; CHECK-NEXT: vmov r6, s20 +; CHECK-NEXT: vmov lr, s21 +; CHECK-NEXT: vmovx.f16 s20, s2 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.16 q6[0], r4 +; CHECK-NEXT: vmov.16 q6[1], r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vmov.16 q6[2], r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmovx.f16 s16, s15 +; CHECK-NEXT: vmov.16 q6[3], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmovx.f16 s14, s14 +; 
CHECK-NEXT: vmov.16 q6[4], r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmov.16 q6[5], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmovx.f16 s15, s1 ; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r12 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r5 -; CHECK-NEXT: vadd.f16 q0, q6, q0 -; CHECK-NEXT: vmovx.f16 s12, s16 -; CHECK-NEXT: vadd.f16 q1, q0, q2 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vins.f16 s15, s3 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r12 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r3 +; CHECK-NEXT: vmov r4, s15 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r5 +; CHECK-NEXT: vmov q1[3], q1[1], lr, r4 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vadd.f16 q1, q6, q1 +; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vadd.f16 q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s4, s19 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmovx.f16 s12, s18 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s20, s1 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s24, s9 ; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmovx.f16 s28, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov r5, s10 +; 
CHECK-NEXT: vmov r6, s1 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmovx.f16 s21, s19 +; CHECK-NEXT: vins.f16 s21, s9 ; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmovx.f16 s28, s3 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s2 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmovx.f16 s16, s17 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmov.16 q4[6], r6 -; CHECK-NEXT: vmov r2, s30 -; CHECK-NEXT: vmov.16 q4[7], r3 -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmovx.f16 s20, s2 +; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov.16 q5[6], r6 +; CHECK-NEXT: vmov.16 q5[7], r5 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r6, s24 +; CHECK-NEXT: vmov.16 q6[0], r5 +; CHECK-NEXT: vmov.16 q6[1], r6 +; CHECK-NEXT: vmov r6, s19 ; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmov.16 q6[2], r6 ; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q4[0], r6 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov.16 q4[1], r5 -; CHECK-NEXT: vmov.16 q4[2], r6 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vmov.16 q4[3], r6 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmovx.f16 
s16, s11 +; CHECK-NEXT: vmov.16 q6[3], r6 +; CHECK-NEXT: vmov r6, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmov.16 q6[4], r6 ; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmov.16 q6[5], r6 +; CHECK-NEXT: vmov r6, s10 +; CHECK-NEXT: vmovx.f16 s11, s1 +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vins.f16 s11, s3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r5 ; CHECK-NEXT: vmov r4, s13 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r0 ; CHECK-NEXT: vadd.f16 q1, q6, q1 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 Index: llvm/test/CodeGen/Thumb2/mve-vld4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -390,84 +390,64 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: vldrb.u8 q2, [r0, #32] -; CHECK-NEXT: vldrb.u8 q3, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[5], r2 +; CHECK-NEXT: vldrb.u8 q3, [r0] +; CHECK-NEXT: vldrb.u8 q0, [r0, #32] +; CHECK-NEXT: vldrb.u8 q1, [r0, #48] +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] ; CHECK-NEXT: 
vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s18, s1 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q3[7] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vins.f16 s18, s3 ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vins.f16 s19, s7 ; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.f32 s16, s13 ; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vins.f16 s16, s15 +; CHECK-NEXT: vmov.f32 s17, s9 ; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.f32 s22, s26 +; CHECK-NEXT: vins.f16 s17, s11 ; CHECK-NEXT: 
vmov.f32 s23, s27 ; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vadd.i16 q4, q4, q5 +; CHECK-NEXT: vmov.f64 d11, d0 +; CHECK-NEXT: vmov.u16 r0, q3[5] ; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vins.f16 s22, s2 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vins.f16 s23, s6 +; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vins.f16 s20, s14 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vmov.f32 s27, s7 -; CHECK-NEXT: vadd.i16 q0, q6, q5 +; CHECK-NEXT: vins.f16 s21, s10 +; CHECK-NEXT: vmov.f32 s26, s14 +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vadd.i16 q0, q5, q6 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -1102,23 +1082,18 @@ ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vadd.f16 q1, q2, q1 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov r0, s8 
+; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 @@ -1142,50 +1117,41 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8} ; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vldrh.u16 q0, [r0, #16] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vldrh.u16 q1, [r0, #16] ; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s12, s5 ; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vadd.f16 q2, q2, q3 -; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vadd.f16 q2, q3, q2 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vmovx.f16 s12, s2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmovx.f16 s16, s4 ; CHECK-NEXT: 
vmov.16 q3[1], r2 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vadd.f16 q0, q1, q3 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 @@ -1287,114 +1253,92 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrb.u8 q3, [r0, #32] +; CHECK-NEXT: vldrb.u8 q1, [r0, #48] ; CHECK-NEXT: vldrb.u8 q6, [r0] -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vldrb.u8 q4, [r0, #32] -; CHECK-NEXT: vldrb.u8 q5, [r0, #48] -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s27 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[0], r0 -; 
CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vmov.f64 d9, d6 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmovx.f16 s20, s26 +; CHECK-NEXT: vmovx.f16 s28, s0 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmovx.f16 s8, s20 -; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s18, s14 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.f32 s19, s4 +; CHECK-NEXT: vmovx.f16 s20, s24 +; CHECK-NEXT: vins.f16 s19, s6 +; CHECK-NEXT: vmov.f32 s6, s13 +; CHECK-NEXT: vins.f16 s6, s15 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vins.f16 s7, s11 +; CHECK-NEXT: vmov.f32 s16, s24 +; CHECK-NEXT: vmov.f32 s4, s25 +; CHECK-NEXT: vins.f16 s16, s26 +; CHECK-NEXT: vins.f16 s4, s27 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmovx.f16 s28, s15 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vins.f16 s17, s2 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmovx.f16 s28, 
s13 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vins.f16 s5, s3 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov.16 q7[5], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmov.16 q7[6], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s24 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vmov.16 q7[7], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmovx.f16 s8, s25 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s8, s3 ; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s8, s14 ; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.f32 s26, s30 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s2 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vmov.f32 s30, s2 
-; CHECK-NEXT: vmov.f32 s31, s3 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmov.f32 s27, s7 -; CHECK-NEXT: vadd.f16 q3, q7, q3 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vadd.f16 q0, q0, q6 -; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.f32 s27, s31 +; CHECK-NEXT: vmov.f32 s22, s14 +; CHECK-NEXT: vadd.f16 q1, q1, q6 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vadd.f16 q0, q4, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vldst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -8,15 +8,15 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: cmp.w r2, r12, lsr #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r5, [sp, #160] +; CHECK-NEXT: ldr r5, [sp, #144] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -24,213 +24,172 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q5, [r0, #32] +; CHECK-NEXT: vldrh.u16 q4, [r0, #32] ; CHECK-NEXT: vldrh.u16 q3, [r0, #48] ; CHECK-NEXT: vldrh.u16 q7, [r0], #64 -; 
CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmovx.f16 s20, s12 +; CHECK-NEXT: vins.f16 s2, s18 ; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov r3, s30 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmul.f16 q0, q1, r5 -; CHECK-NEXT: vmovx.f16 s4, s24 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s30 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vmovx.f16 s12, s15 +; CHECK-NEXT: vins.f16 s3, s14 +; CHECK-NEXT: vmovx.f16 s8, s24 +; CHECK-NEXT: vmov.f32 s0, s28 +; CHECK-NEXT: vins.f16 s0, s30 +; CHECK-NEXT: vmov.f32 s1, s24 +; CHECK-NEXT: vins.f16 s1, s26 +; CHECK-NEXT: vmul.f16 q1, q0, r5 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s20 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov.16 q0[1], r4 +; 
CHECK-NEXT: vmovx.f16 s8, s16 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmovx.f16 s20, s26 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 s8, s25 ; CHECK-NEXT: vmul.f16 q0, q0, r5 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r3, s29 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s0, s7 +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vins.f16 s2, s19 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vins.f16 s3, s15 +; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vins.f16 s0, s31 +; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vins.f16 s1, s27 +; 
CHECK-NEXT: vmul.f16 q5, q0, r5 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmovx.f16 s0, s31 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s29 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmul.f16 q4, q1, r5 ; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmovx.f16 s4, s25 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s27 -; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmovx.f16 s16, s13 +; CHECK-NEXT: vmov.16 q2[4], r4 +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s27 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmul.f16 q6, q0, r5 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vins.f16 s20, s24 ; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 
16-byte Reload -; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.f32 s9, s20 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q2[6], r3 +; CHECK-NEXT: vmov.16 q2[7], r2 ; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q4[4], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vins.f16 s21, s25 +; CHECK-NEXT: vmov.16 q4[5], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q7[0], r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmovx.f16 s0, s25 +; CHECK-NEXT: vmov.16 q2[6], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s5, s13 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s28, s5 +; CHECK-NEXT: vmovx.f16 s0, s13 ; CHECK-NEXT: vmov.16 q7[4], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s22 ; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[3], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vins.f16 s22, s26 +; 
CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s9, s22 +; CHECK-NEXT: vmovx.f16 s12, s26 ; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vins.f16 s6, s2 ; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s16, s27 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s25 -; CHECK-NEXT: vstrh.16 q0, [r1, #32] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrh.16 q2, [r1, #48] -; CHECK-NEXT: vstrh.16 q5, [r1], #64 -; CHECK-NEXT: vmov.f32 s31, s15 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmovx.f16 s12, s2 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s23 +; CHECK-NEXT: vins.f16 s23, s27 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.f32 s21, s23 +; 
CHECK-NEXT: vmovx.f16 s12, s27 +; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.f32 s31, s3 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s3, s23 +; CHECK-NEXT: vstrh.16 q1, [r1, #32] +; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vstrh.16 q0, [r1, #48] +; CHECK-NEXT: vstrh.16 q4, [r1], #64 ; CHECK-NEXT: vstrh.16 q7, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: Index: llvm/test/CodeGen/Thumb2/mve-vmovn.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -393,45 +393,29 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmovx.f16 s9, s5 +; 
CHECK-NEXT: vins.f16 s9, s1 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vins.f16 s11, s3 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b2: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: vrev64.16 q2, q1 -; CHECKBE-NEXT: vrev64.16 q3, q0 -; CHECKBE-NEXT: vmov.u16 r0, q2[1] -; CHECKBE-NEXT: vmov.16 q1[0], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[0] -; CHECKBE-NEXT: vmov.16 q1[1], r0 -; CHECKBE-NEXT: vmov.u16 r0, q2[3] -; CHECKBE-NEXT: vmov.16 q1[2], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[2] -; CHECKBE-NEXT: vmov.16 q1[3], r0 -; CHECKBE-NEXT: vmov.u16 r0, q2[5] -; CHECKBE-NEXT: vmov.16 q1[4], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[4] -; CHECKBE-NEXT: vmov.16 q1[5], r0 -; CHECKBE-NEXT: vmov.u16 r0, q2[7] -; CHECKBE-NEXT: vmov.16 q1[6], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[6] -; CHECKBE-NEXT: vmov.16 q1[7], r0 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vins.f16 s5, s9 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vins.f16 s4, s8 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s6, s10 +; CHECKBE-NEXT: vmovx.f16 s7, s3 +; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr entry: @@ -442,46 +426,30 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: 
vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmovx.f16 s1, s9 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s2, s10 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vins.f16 s3, s7 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b3: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.16 q3, q0 +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vmovx.f16 s5, s13 +; CHECKBE-NEXT: vins.f16 s5, s9 +; CHECKBE-NEXT: vmovx.f16 s4, s12 +; CHECKBE-NEXT: vins.f16 s4, s8 +; CHECKBE-NEXT: vmovx.f16 s6, s14 +; CHECKBE-NEXT: vins.f16 s6, s10 +; CHECKBE-NEXT: vmovx.f16 s7, s15 +; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 -; CHECKBE-NEXT: vmov.u16 r0, q3[1] -; CHECKBE-NEXT: vmov.16 q2[0], r0 -; CHECKBE-NEXT: vmov.u16 r0, q0[0] -; CHECKBE-NEXT: vmov.16 q2[1], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[3] -; CHECKBE-NEXT: vmov.16 q2[2], r0 -; CHECKBE-NEXT: vmov.u16 r0, q0[2] -; CHECKBE-NEXT: vmov.16 q2[3], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[5] -; CHECKBE-NEXT: vmov.16 q2[4], r0 -; CHECKBE-NEXT: vmov.u16 r0, q0[4] -; CHECKBE-NEXT: vmov.16 q2[5], r0 -; CHECKBE-NEXT: vmov.u16 r0, q3[7] -; CHECKBE-NEXT: vmov.16 q2[6], r0 -; CHECKBE-NEXT: vmov.u16 r0, q0[6] -; CHECKBE-NEXT: vmov.16 q2[7], r0 -; CHECKBE-NEXT: vrev64.16 q0, q2 ; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll +++ llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -314,22 +314,14 @@ define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov.16 
q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vmovx.f16 s9, s5 +; CHECK-NEXT: vins.f16 s9, s1 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vins.f16 s11, s3 ; CHECK-NEXT: vstrw.32 q2, [r0] ; CHECK-NEXT: bx lr entry: @@ -341,22 +333,14 @@ define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] -; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vmovx.f16 s9, s1 +; CHECK-NEXT: vins.f16 s9, s5 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s10, s2 +; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vmovx.f16 s11, s3 +; CHECK-NEXT: vins.f16 s11, s7 ; CHECK-NEXT: vstrw.32 q2, [r0] ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -140,31 +140,23 @@ define arm_aapcs_vfpcc <8 
x i16> @vqdmulh_i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_i16_interleaved: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.f32 s9, s2 ; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vins.f16 s9, s3 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vins.f16 s8, s1 ; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.f32 s1, s6 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vins.f16 s0, s5 ; CHECK-NEXT: vmov.16 q0[4], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q0[5], r0 @@ -173,18 +165,15 @@ ; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: vqdmulh.s16 q1, q0, q2 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.f32 s0, s4 ; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vins.f16 s8, s7 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: 
vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] Index: llvm/test/CodeGen/Thumb2/mve-vst2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -208,40 +208,32 @@ ; CHECK-LABEL: vst2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d3 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vins.f16 s12, s0 +; CHECK-NEXT: vins.f16 s10, s3 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vins.f16 s14, s1 ; CHECK-NEXT: vmov.16 q3[6], r0 -; 
CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrb.8 q0, [r1, #16] ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: @@ -523,19 +515,16 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r0, [r0] ; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: str r2, [r1, #4] ; CHECK-NEXT: bx lr @@ -553,32 +542,28 @@ ; CHECK-LABEL: vst2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: vmov.32 q0[0], r12 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmovx.f16 s0, s5 ; 
CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vstrh.16 q1, [r1] +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vstrh.16 q2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -633,46 +618,38 @@ define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vins.f16 s2, s6 ; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov.f64 d4, d1 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmovx.f16 s12, s3 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vins.f16 s3, s7 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vstrb.8 q2, [r1, #16] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s5 -; 
CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s12, s1 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmovx.f16 s0, s5 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[7], r0 Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -385,78 +385,74 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.u16 r0, 
q3[2] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vins.f16 s16, s8 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r0, q3[3] ; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.u16 r0, q4[3] ; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.u16 r0, q4[4] ; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q5[5] ; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q3[5] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov.u16 r0, q2[6] ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.f32 s13, s25 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.f32 s17, s25 ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.16 q5[7], r0 ; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vrev32.16 q4, q4 -; CHECK-NEXT: vmov.16 q7[2], r2 ; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.u16 r2, q4[2] -; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.f32 s1, s13 ; CHECK-NEXT: vmov.16 q7[3], r0 ; CHECK-NEXT: vmov.u16 r0, q5[4] -; 
CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vrev32.16 q2, q2 ; CHECK-NEXT: vmov.16 q7[4], r0 ; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.f32 s21, s29 -; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.f32 s22, s30 -; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -482,181 +478,173 @@ ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.u16 r2, q7[1] ; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s8, s28 ; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vins.f16 s11, s29 +; CHECK-NEXT: vmov q4, q2 +; 
CHECK-NEXT: vmov.f32 s17, s0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.16 q3[2], r3 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.u16 r2, q4[4] +; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s22, s3 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d12, d0 +; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s24, s4 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.f32 s27, s1 +; CHECK-NEXT: vins.f16 s27, s5 +; CHECK-NEXT: vmov.f32 s25, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov q5, q0 ; CHECK-NEXT: vdup.32 q0, r0 
-; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q6[4] +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q5[5] ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u16 r0, q5[7] ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.16 q4[7], r0 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.f32 s25, s8 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.f32 s18, s23 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vmov.16 q1[4], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.16 
q1[5], r0 ; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[1], r0 ; CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vdup.32 q7, r2 -; CHECK-NEXT: vrev32.16 q3, q3 -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q7, q7 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: 
vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vrev32.16 q0, q7 ; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q0, q0 ; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.f32 s5, s21 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s9 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s22 +; 
CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vmov.f32 s13, s9 +; CHECK-NEXT: vstrw.32 q1, [r1, #80] +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s30, s6 ; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q6, [r1] ; CHECK-NEXT: vmov.u16 r2, q7[3] -; CHECK-NEXT: vmov.f32 s13, s5 ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q7[4] ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov.f32 s29, s1 ; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 
s14, s6 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vstrw.32 q3, [r1, #64] +; CHECK-NEXT: vstrw.32 q7, [r1, #64] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1417,27 +1405,25 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) { ; CHECK-LABEL: vst3_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s4, s5} -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vldmia r0, {s8, s9} +; CHECK-NEXT: ldr r2, [r0, #8] +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s8, s9 +; CHECK-NEXT: vmov.16 q1[2], r2 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.16 q1[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: stm r1!, {r0, r2, r3} ; CHECK-NEXT: bx lr entry: @@ -1462,38 +1448,34 @@ ; CHECK-NEXT: ldrd lr, r12, [r0] ; CHECK-NEXT: ldrd 
r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s12, s6 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmovx.f16 s4, s7 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[3], r0 @@ -1522,39 +1504,36 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; 
CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmovx.f16 s16, s18 ; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vins.f16 s3, s5 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmov.16 q3[3], r2 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmovx.f16 s16, s22 ; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: vmov.16 q4[1], r0 @@ -1573,7 +1552,6 @@ ; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmovx.f16 s28, s30 -; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vmov.f32 s1, s13 ; CHECK-NEXT: vmov.f32 s2, s14 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -1634,200 +1612,191 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #144 ; CHECK-NEXT: sub sp, #144 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: 
vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov r3, s28 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d2, d0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s4, s20 ; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q3[2], r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.f32 s7, s1 
+; CHECK-NEXT: vins.f16 s7, s21 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.16 q7[0], r3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f64 d2, d6 ; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s24 ; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q7[1], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q7[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q7[7], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vins.f16 s4, s24 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q1[4], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.f32 s7, s13 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.f32 s29, s20 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vins.f16 s7, s25 +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; 
CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.f32 s22, s31 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s26 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vmov.16 q4[1], 
r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.f32 s9, s25 -; CHECK-NEXT: vmov.f32 s17, s13 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmovx.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vrev32.16 q0, q1 +; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.f32 s10, s26 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmovx.f16 s0, s13 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vmov.16 q6[6], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.f32 s25, s29 +; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: 
vmovx.f16 s0, s25 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q3, q0 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmovx.f16 s12, s14 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s4, s14 +; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s25, s29 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vstrw.32 q6, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vstrw.32 q5, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: vmov.f32 s26, s30 +; 
CHECK-NEXT: vrev32.16 q2, q1 +; CHECK-NEXT: vstrw.32 q6, [r1, #64] +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s13, s9 +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vmov.f32 s22, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [r1, #80] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vmov.f32 s17, s13 +; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] ; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -422,76 +422,64 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 
r0, q3[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f64 d0, d8 +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vins.f16 s21, s21 ; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vmov.f32 s12, s17 ; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vins.f16 s12, s9 ; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vins.f16 s25, s25 +; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.f32 s17, s21 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vstrb.8 q4, [r1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] ; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.f32 s13, s21 ; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.f32 
s15, s23 +; CHECK-NEXT: vmov.f32 s20, s19 +; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vins.f16 s20, s11 +; CHECK-NEXT: vmov.f32 s3, s27 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.f32 s25, s7 ; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vins.f16 s25, s25 +; CHECK-NEXT: vstrb.8 q3, [r1, #16] +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.f32 s21, s25 ; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.f64 d12, d9 ; CHECK-NEXT: vstrb.8 q5, [r1, #48] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.f32 s17, s6 +; CHECK-NEXT: vins.f16 s24, s10 ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vins.f16 s17, s17 ; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.f32 s25, s9 -; CHECK-NEXT: vmov.f32 s27, s11 +; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vmov.f32 s27, s19 ; CHECK-NEXT: vstrb.8 q6, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -1056,30 +1044,26 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s4, s5} -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vldmia r0, {s0, s1} ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: 
vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vstrh.16 q2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1100,58 +1084,50 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldrd lr, r12, [r0] ; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r0 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 +; CHECK-NEXT: ldrd r3, r0, [r0, #16] +; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmov 
r2, s4 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r0 +; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmovx.f16 s14, s3 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrh.16 q2, [r1] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vstrh.16 q3, [r1] +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x 
half>* %s1, align 4 @@ -1238,101 +1214,90 @@ define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .vsave {d12, d13, d14, d15} +; CHECK-NEXT: vpush {d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vmov r3, s30 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vmovx.f16 s0, s30 -; CHECK-NEXT: vmov.16 q1[1], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vins.f16 s17, s17 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vins.f16 s30, s14 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.f64 d4, d15 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmovx.f16 s0, s29 -; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vins.f16 s29, s13 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov.f32 s4, s29 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: 
vmov.16 q1[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vins.f16 s16, s16 +; CHECK-NEXT: vmov.16 q1[5], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: vmovx.f16 s0, s28 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vins.f16 s28, s12 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov q6, q7 +; CHECK-NEXT: vmovx.f16 s0, s12 ; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vins.f16 s19, s19 ; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s28, s31 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s7 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s20, s22 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.f32 s25, s17 -; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: vmovx.f16 s20, s31 +; CHECK-NEXT: vins.f16 s31, s15 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte 
Reload -; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.f32 s28, s31 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.16 q7[4], r0 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmov.f32 s27, s19 -; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vins.f16 s18, s18 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmovx.f16 s12, s14 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.f32 s9, s17 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s31, s3 +; CHECK-NEXT: vstrb.8 q1, [r1, #16] +; CHECK-NEXT: vmov.f32 s25, s13 +; CHECK-NEXT: vstrb.8 q7, [r1, #48] +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vmov.f32 s11, s19 ; CHECK-NEXT: vstrb.8 q6, [r1] -; CHECK-NEXT: vmov.f32 s7, s31 -; CHECK-NEXT: vstrb.8 q0, [r1, #48] -; CHECK-NEXT: vstrb.8 q1, [r1, #32] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vstrb.8 q2, [r1, #32] +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vpop {d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0