Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1851,6 +1851,13 @@ (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>; def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane), (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>; + // This tries to copy from one lane to another, without going via GPR regs + def : Pat<(insertelt (v4i32 MQPR:$src1), (extractelt (v4i32 MQPR:$src2), imm:$extlane), imm:$inslane), + (v4i32 (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4i32 MQPR:$src1), MQPR)), + (f32 (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4i32 MQPR:$src2), MQPR)), + (SSubReg_f32_reg imm:$extlane))), + (SSubReg_f32_reg imm:$inslane)), + MQPR))>; def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane), (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>; Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -62,8 +62,7 @@ define arm_aapcs_vfpcc <4 x i32> @oneoff11_i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: oneoff11_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -73,8 +72,7 @@ define arm_aapcs_vfpcc <4 x i32> @oneoff12_i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: oneoff12_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.f32 s0, s4 ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -84,8 +82,7 @@ define arm_aapcs_vfpcc <4 x i32> @oneoff21_i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: oneoff21_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -97,8 +94,7 @@ ; CHECK-LABEL: oneoff22_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.f32 s2, s0 ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -55,7 +55,7 @@ ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: vmov.32 q1[1], r1 @@ -889,7 +889,7 @@ ; CHECK-LABEL: add_v2i32_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov.32 q1[1], r2 Index: llvm/test/CodeGen/Thumb2/mve-vld3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -36,25 +36,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s0, s14 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vadd.i32 q1, q4, q3 -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f64 d3, d0 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vadd.i32 q0, q2, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -74,44 +71,39 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s0, s14 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.32 q4[3], r2 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vadd.i32 q1, q4, q3 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmov.f64 d3, d0 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vadd.i32 q0, q2, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f64 d3, d4 +; CHECK-NEXT: vmov.f64 d5, d2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vmov.f32 s13, s15 ; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.32 q5[3], r0 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.f32 s5, s17 -; CHECK-NEXT: vadd.i32 q2, q3, q5 -; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s9, s17 +; CHECK-NEXT: vadd.i32 q1, q3, q5 +; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -131,83 +123,73 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s0, s14 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.32 q4[3], r2 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vadd.i32 q1, q4, q3 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmov.f64 d3, d0 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vadd.i32 q0, q2, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vmov.f64 d3, d4 +; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vmov.f32 s13, s15 ; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vmov.f32 s5, s17 -; CHECK-NEXT: vadd.i32 q2, q3, q5 -; CHECK-NEXT: vldrw.u32 q3, [r0, #176] -; CHECK-NEXT: vldrw.u32 q5, [r0, #144] -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vmov.f64 d5, d6 -; CHECK-NEXT: vldrw.u32 q4, [r0, #160] -; CHECK-NEXT: vmov.f32 s11, s15 -; CHECK-NEXT: vmov.f64 d12, d10 -; CHECK-NEXT: vmov.f32 s8, s22 -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov.f32 s20, s21 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[3], r2 -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.i32 q3, q6, q5 +; CHECK-NEXT: vadd.i32 q1, q3, q5 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: vldrw.u32 q4, [r0, #144] +; CHECK-NEXT: vldrw.u32 q5, [r0, #160] +; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s25, s20 +; CHECK-NEXT: vmov.f32 s17, s19 +; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.f32 s27, s10 +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vmov.f32 s13, s21 +; CHECK-NEXT: vadd.i32 q2, q4, q6 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vldrw.u32 q3, [r0, #128] ; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vldrw.u32 q4, [r0, #128] ; CHECK-NEXT: vldrw.u32 q6, [r0, #112] -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s28, s21 -; CHECK-NEXT: vmov.f64 d7, d8 +; CHECK-NEXT: vmov.f64 d9, d6 ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s19, s15 +; CHECK-NEXT: vmov.f32 s28, s21 +; CHECK-NEXT: vmov.f32 s16, s22 ; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmov.f32 s12, s22 ; CHECK-NEXT: vmov.f32 s21, s23 ; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.32 q7[3], r0 ; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vadd.i32 q4, q5, q7 -; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov.f32 s31, s14 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vadd.i32 q3, q5, q7 +; CHECK-NEXT: vadd.i32 q3, q3, q4 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -45,25 +45,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f64 d8, d6 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vmov.f32 s19, s13 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.f32 s1, s15 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -89,47 +86,41 @@ ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f32 s9, s20 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vmov.f32 s21, s28 ; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vmov.f64 d4, d1 ; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vmov.f32 s9, s27 ; CHECK-NEXT: vmov.f32 s31, s25 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vmov.32 q7[2], r0 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s30, s0 ; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vstrw.32 q7, [r1, #48] ; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.f32 s12, s25 -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vmov.f32 s13, s21 +; CHECK-NEXT: vmov.f32 s2, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s20, s22 -; CHECK-NEXT: vstrw.32 q1, [r1, #80] +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.f32 s12, s25 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s15, s26 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.f32 s21, s19 -; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.f32 s5, s19 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s6, s27 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -154,112 +145,104 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q0, [r0, #144] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q6, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vldrw.u32 q4, [r0, #176] -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r0, #160] -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vldrw.u32 q7, [r0, #112] -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.f64 d6, d13 -; CHECK-NEXT: vmov.f32 s13, s23 -; CHECK-NEXT: vmov.f32 s15, s27 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d8, d10 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s5, s28 -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q7, [r0, #96] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vldrw.u32 q3, [r0, #160] +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vldrw.u32 q5, [r0, #144] +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s27 +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f64 d8, d3 +; CHECK-NEXT: vmov.f32 s17, s31 +; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s12, s29 -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.f64 d8, d12 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vmov.f32 s18, s8 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s7, s14 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s30 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s19, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f64 d12, d11 ; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.f64 d10, d3 -; CHECK-NEXT: vstrw.32 q6, [r1, #112] -; CHECK-NEXT: vstrw.32 q3, [r1, #128] -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s11, s3 ; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov.f32 s29, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s25, s7 +; CHECK-NEXT: vstrw.32 q4, [r1, #112] +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s20 +; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov.f32 s31, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.32 q7[2], r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #80] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q2, [r1, #128] +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f64 d0, d2 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s16 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s12 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s4, s17 -; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #176] ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #64] +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vstrw.32 q0, [r1, #176] +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -321,14 +304,9 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q3, [r0, #8] -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov r0, s12 @@ -338,12 +316,16 @@ ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.f32 s1, s7 ; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.f32 s2, s15 ; CHECK-NEXT: vmov.16 q4[7], r0 ; CHECK-NEXT: vstrh.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1]