diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -13090,6 +13090,12 @@ return Load; } + // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) + if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N0->getOperand(1))) + return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), + N0->getOperand(1)); + return SDValue(); } @@ -13840,8 +13846,21 @@ /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SDValue Op = N->getOperand(0); + EVT VT = N->getValueType(0); + + // On MVE, we just convert the VDUPLANE to a VDUP with an extract. + if (Subtarget->hasMVEIntegerOps()) { + EVT ExtractVT = VT.getVectorElementType(); + // We need to ensure we are creating a legal type. + if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) + ExtractVT = MVT::i32; + SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, + N->getOperand(0), N->getOperand(1)); + return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); + } // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 
@@ -13862,7 +13881,6 @@ unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; - EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); @@ -15343,7 +15361,7 @@ case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); - case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -264,7 +264,7 @@ def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; -def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>; def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1690,10 +1690,14 @@ (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane), + 
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; def : Pat<(v16i8 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; @@ -2227,25 +2231,11 @@ def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - // For the 16-bit and 8-bit vduplanes we don't care about the signedness - // of the lane move operation as we only want the lowest 8/16 bits anyway. - def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; - def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)), - (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>; - def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP16 rGPR:$elem)>; def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; - // Match a vselect with an ARMvdup as a predicated MVE_VDUP def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 (ARMvdup (i32 rGPR:$elem))), diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -243,9 +243,8 @@ ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vmov.32 r0, q0[1] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vadd.f32 q0, q0, r0 ; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB1_4: @@ -513,9 +512,8 @@ ; 
CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vmov.32 r0, q0[1] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vadd.f32 q0, q0, r0 ; CHECK-NEXT: b .LBB2_23 ; CHECK-NEXT: .LBB2_22: ; CHECK-NEXT: vldr s0, .LCPI2_0 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -148,7 +148,7 @@ ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov.32 r0, q0[0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -162,7 +162,7 @@ define arm_aapcs_vfpcc <4 x i32> @vduplane_i32(<4 x i32> %src) { ; CHECK-LABEL: vduplane_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, q0[3] +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: bx lr entry: @@ -206,7 +206,7 @@ define arm_aapcs_vfpcc <4 x float> @vduplane_f32(<4 x float> %src) { ; CHECK-LABEL: vduplane_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, q0[3] +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -216,7 +216,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) { ; CHECK-FP-LABEL: fmin_v2f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: 
vminnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -224,11 +224,9 @@ ; ; CHECK-NOFP-LABEL: fmin_v2f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r0, q0[1] -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 +; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) @@ -241,7 +239,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -273,7 +271,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -311,7 +309,7 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -322,7 +320,7 @@ ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r1, q0[1] +; CHECK-NOFP-NEXT: vmov r1, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vdup.32 q1, r1 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 @@ -348,7 +346,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -400,7 +398,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -759,7 +757,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v2f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q2, r0 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 @@ -769,11 +767,9 @@ ; ; CHECK-NOFP-LABEL: fmin_v2f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r0, q0[1] -; CHECK-NOFP-NEXT: vdup.32 q2, r0 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 +; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -791,7 +787,7 @@ ; CHECK-FP-NEXT: vmov.f64 d4, d1 ; CHECK-FP-NEXT: vmov.f32 s9, s3 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q2, r0 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 @@ -830,7 +826,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 @@ -875,7 +871,7 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; 
CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -890,7 +886,7 @@ ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r1, q0[1] +; CHECK-NOFP-NEXT: vmov r1, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vdup.32 q1, r1 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 @@ -924,7 +920,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -988,7 +984,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -1335,7 +1331,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) { ; CHECK-FP-LABEL: fmax_v2f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -1343,11 +1339,9 @@ ; ; CHECK-NOFP-LABEL: fmax_v2f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r0, q0[1] -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 +; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) @@ -1360,7 +1354,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; 
CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -1392,7 +1386,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 @@ -1430,7 +1424,7 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -1441,7 +1435,7 @@ ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r1, q0[1] +; CHECK-NOFP-NEXT: vmov r1, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vdup.32 q1, r1 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 @@ -1467,7 +1461,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -1519,7 +1513,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -1878,7 +1872,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v2f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q2, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 ; CHECK-FP-NEXT: 
vcmp.f32 s4, s0 @@ -1888,11 +1882,9 @@ ; ; CHECK-NOFP-LABEL: fmax_v2f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r0, q0[1] -; CHECK-NOFP-NEXT: vdup.32 q2, r0 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 +; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -1910,7 +1902,7 @@ ; CHECK-FP-NEXT: vmov.f64 d4, d1 ; CHECK-FP-NEXT: vmov.f32 s9, s3 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q2, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 @@ -1949,7 +1941,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r0, q0[1] +; CHECK-FP-NEXT: vmov r0, s1 ; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 @@ -1994,7 +1986,7 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] @@ -2009,7 +2001,7 @@ ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov.32 r1, q0[1] +; CHECK-NOFP-NEXT: vmov r1, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vdup.32 q1, r1 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 @@ -2043,7 +2035,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: 
vmov.u16 r1, q0[1] @@ -2107,7 +2099,7 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.32 r1, q0[1] +; CHECK-FP-NEXT: vmov r1, s1 ; CHECK-FP-NEXT: vdup.32 q1, r1 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov.u16 r1, q0[1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -8,16 +8,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: ldrd r12, r3, [r0, #16] -; CHECK-NEXT: vmov.32 r0, q0[1] +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.f64 d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.32 r2, q0[2] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: add r2, r12 @@ -45,15 +41,15 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.32 r0, q2[2] +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.32 r0, q2[1] ; CHECK-NEXT: vmov.f64 d8, d2 -; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vmov.f32 s10, s8 @@ -84,16 +80,16 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.32 r2, q2[2] ; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vdup.32 q4, r2 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.32 r2, q2[1] ; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: 
vmov.f64 d8, d2 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s19, s23 @@ -107,15 +103,15 @@ ; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.32 r0, q3[2] +; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.32 r0, q3[1] ; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vdup.32 q6, r0 ; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov r0, s13 ; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vdup.32 q6, r0 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s23, s27 ; CHECK-NEXT: vmov.f32 s14, s12 @@ -148,16 +144,16 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.32 r2, q2[2] ; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vdup.32 q4, r2 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.32 r2, q2[1] ; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f64 d8, d2 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s19, s23 @@ -171,16 +167,16 @@ ; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.32 r2, q3[2] +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.32 r2, q3[1] ; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vldrw.u32 q0, [r0, #128] ; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov r2, s13 ; CHECK-NEXT: vmov.f32 s22, s6 +; 
CHECK-NEXT: vdup.32 q6, r2 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vmov.f32 s23, s27 @@ -193,15 +189,15 @@ ; CHECK-NEXT: vldrw.u32 q4, [r0, #176] ; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.32 r2, q4[2] +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vdup.32 q6, r2 ; CHECK-NEXT: vmov.f32 s22, s11 ; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.32 r2, q4[1] ; CHECK-NEXT: vmov.f64 d12, d6 -; CHECK-NEXT: vdup.32 q7, r2 ; CHECK-NEXT: vmov.f32 s25, s15 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: vdup.32 q7, r2 ; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.f32 s27, s31 ; CHECK-NEXT: vmov.f32 s18, s16 @@ -212,15 +208,15 @@ ; CHECK-NEXT: vadd.i32 q2, q5, q2 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] ; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.32 r0, q0[2] -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.32 r0, q0[1] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s25, s20 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f64 d6, d8 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: vdup.32 q7, r0 +; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.f32 s27, s31 ; CHECK-NEXT: vdup.32 q7, r0 ; CHECK-NEXT: vmov.f32 s14, s22 @@ -1079,31 +1075,27 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld3_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.32 r0, q2[2] -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s15, s19 
-; CHECK-NEXT: vmov.32 r0, q2[1] ; CHECK-NEXT: vmov.f64 d8, d2 -; CHECK-NEXT: vdup.32 q5, r0 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s19, s9 ; CHECK-NEXT: vmov.f32 s10, s8 ; CHECK-NEXT: vadd.f32 q3, q4, q3 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x float>, <12 x float>* %src, align 4 @@ -1119,25 +1111,21 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vld3_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.32 r2, q2[2] -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vdup.32 q4, r2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.32 r2, q2[1] -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s19, s9 ; CHECK-NEXT: vmov.f32 s10, s8 ; CHECK-NEXT: vadd.f32 q3, q4, q3 ; CHECK-NEXT: vmov.f32 s2, s8 @@ -1147,25 +1135,21 @@ ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s4 
-; CHECK-NEXT: vmov.32 r0, q3[2] -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.32 r0, q3[1] ; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vdup.32 q6, r0 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 ; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s14, s12 ; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x float>, <24 x float>* %src, align 4 @@ -1183,25 +1167,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.32 r2, q2[2] -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vdup.32 q4, r2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.32 r2, q2[1] -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s19, s9 ; CHECK-NEXT: vmov.f32 s10, s8 ; CHECK-NEXT: vadd.f32 q3, q4, q3 ; CHECK-NEXT: vmov.f32 s2, s8 @@ -1210,21 +1188,15 @@ ; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: 
vmov.f32 s16, s9 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.32 r2, q3[2] -; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.32 r2, q3[1] ; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vldrw.u32 q0, [r0, #128] +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s14, s12 ; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s6, s12 @@ -1233,49 +1205,40 @@ ; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vldrw.u32 q4, [r0, #176] ; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.32 r2, q4[2] -; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.32 r2, q4[1] ; CHECK-NEXT: vmov.f64 d12, d6 -; CHECK-NEXT: vdup.32 q7, r2 +; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s25, s15 +; CHECK-NEXT: vmov.f32 s22, s11 ; CHECK-NEXT: vmov.f32 s26, s10 ; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s27, s31 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s23, s18 +; CHECK-NEXT: vmov.f32 s27, s17 ; CHECK-NEXT: vmov.f32 s18, s16 ; CHECK-NEXT: vadd.f32 q5, q6, q5 ; CHECK-NEXT: vmov.f32 s10, s16 ; CHECK-NEXT: vmov.f32 s11, s19 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] ; CHECK-NEXT: vadd.f32 q2, q5, q2 -; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] ; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.32 r0, q0[2] -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.32 r0, q0[1] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: 
vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s13, s19 -; CHECK-NEXT: vmov.f32 s27, s31 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s20, s18 -; CHECK-NEXT: vmov.f32 s15, s31 -; CHECK-NEXT: vmov.f32 s2, s0 -; CHECK-NEXT: vadd.f32 q6, q3, q6 -; CHECK-NEXT: vmov.f32 s22, s0 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vadd.f32 q0, q6, q5 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d14, d8 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s31, s21 +; CHECK-NEXT: vmov.f32 s22, s20 +; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vadd.f32 q3, q6, q3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -9,29 +9,24 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s8, s3 -; CHECK-NEXT: vmov.32 r3, q1[0] ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.32 r2, q0[2] -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: str r0, [r1, #4] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: 
vmov r2, s12 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: add.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: add r0, r3 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: strd r0, r12, [r1] ; CHECK-NEXT: bx lr entry: %l1 = load <8 x i32>, <8 x i32>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -51,12 +51,12 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vmov.f32 s4, s9 ; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.32 r0, q0[0] -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.32 r0, q2[3] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vmov.f32 s13, s8 ; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov r0, s11 ; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vdup.32 q2, r0 ; CHECK-NEXT: vmov.f32 s15, s17 @@ -96,15 +96,14 @@ ; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f64 d6, d1 ; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmov.32 r2, q1[3] ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vdup.32 q2, r2 ; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q7[0] -; CHECK-NEXT: vmov.f32 s13, s23 ; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s23 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vdup.32 q2, r2 ; CHECK-NEXT: vmov.f32 s14, s10 ; CHECK-NEXT: vmov.f64 d4, d8 ; CHECK-NEXT: vstrw.32 q3, [r1, #80] @@ -112,16 +111,16 @@ ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov.f32 s21, s4 +; CHECK-NEXT: vmov r0, s28 ; CHECK-NEXT: vmov.f32 s23, s17 ; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.32 r0, q0[0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vdup.32 
q6, r0 ; CHECK-NEXT: vmov.f32 s10, s18 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.32 r0, q4[3] ; CHECK-NEXT: vmov.f32 s22, s26 ; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s2, s6 @@ -131,6 +130,7 @@ ; CHECK-NEXT: vmov.f32 s5, s29 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmov.f32 s28, s30 +; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: vmov.f32 s7, s18 ; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov.f32 s29, s27 @@ -162,119 +162,116 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q7, [r0, #128] -; CHECK-NEXT: vldrw.u32 q0, [r0, #144] -; CHECK-NEXT: vstrw.32 q2, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s4, s25 +; CHECK-NEXT: vldrw.u32 q1, [r0, #144] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q3, [r0, #128] ; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vmov.f32 s7, s26 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d4, d15 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vldrw.u32 q4, [r0, #176] -; CHECK-NEXT: vmov.32 r0, q6[3] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: 
vdup.32 q1, r0 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s23 -; CHECK-NEXT: vmov.f32 s11, s31 -; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #160] +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vmov.f32 s10, s22 +; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d2, d7 +; CHECK-NEXT: vdup.32 q2, r0 +; CHECK-NEXT: vmov.f32 s5, s23 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vmov.f64 d2, d9 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s5, s31 ; CHECK-NEXT: vmov.f32 s7, s19 +; CHECK-NEXT: vmov r0, s27 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.32 r0, q7[0] ; CHECK-NEXT: vmov.f64 d2, d10 ; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q4[0] -; CHECK-NEXT: vmov.f32 s5, s24 ; CHECK-NEXT: vmov.f32 s7, s21 ; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f64 d0, d14 ; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s1, s24 +; CHECK-NEXT: vmov.f32 s3, s29 +; CHECK-NEXT: vmov r0, s16 ; 
CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmov.f32 s16, s25 ; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.f32 s27, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s30 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.f32 s28, s5 +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.f32 s31, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s30, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.32 r0, q4[3] -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vstrw.32 q6, [r1, #112] -; CHECK-NEXT: vstrw.32 q2, [r1, #128] +; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vstrw.32 q7, [r1, #112] ; CHECK-NEXT: vmov.f32 s13, s7 ; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.f32 s19, s3 +; 
CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vstrw.32 q4, [r1, #128] ; CHECK-NEXT: vmov.f32 s14, s2 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vmov.f32 s21, s4 ; CHECK-NEXT: vstrw.32 q3, [r1, #80] ; CHECK-NEXT: vmov.f32 s23, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.32 r0, q0[0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q7[0] +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.f32 s22, s2 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q5, [r1, #96] ; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.f32 s5, s24 +; CHECK-NEXT: vmov q6, q0 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f32 s28, s1 -; CHECK-NEXT: vmov.f32 s31, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q0, [r1, #144] +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov.f32 s11, s2 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [r1, #64] +; CHECK-NEXT: vmov.f32 s10, s26 +; CHECK-NEXT: vstrw.32 q0, [r1, #144] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1, #160] ; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #176] @@ -347,9 +344,9 @@ ; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q3, [r0, #8] ; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.f32 
s2, s18 @@ -411,14 +408,14 @@ ; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.32 r2, q1[0] +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vdup.32 q5, r2 ; CHECK-NEXT: vmov.f32 s13, s8 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r0, q3[3] ; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.32 r2, q2[3] +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.16 q6[4], r0 @@ -505,7 +502,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vmov.u16 r2, q2[3] ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov.32 r3, q0[0] +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill @@ -534,7 +531,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vmov.32 r0, q0[3] +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill @@ -548,13 +545,13 @@ ; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vmov.16 q6[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q0[0] +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[6], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vdup.32 q0, r2 @@ -582,7 +579,7 @@ ; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q1[3] +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: 
vdup.32 q7, r2 ; CHECK-NEXT: vrev32.16 q3, q3 @@ -800,7 +797,7 @@ ; CHECK-NEXT: vmov.u16 r2, q1[7] ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.32 r0, q3[3] +; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov.f32 s2, s11 @@ -1202,30 +1199,26 @@ define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) { ; CHECK-LABEL: vst3_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.32 r0, q0[0] -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.32 r0, q2[3] -; CHECK-NEXT: vmov.f32 s5, s1 -; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.f64 d8, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s6, s18 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 @@ -1246,57 +1239,45 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, 
d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmov.32 r2, q1[3] -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vdup.32 q2, r2 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q7[0] -; CHECK-NEXT: vmov.f32 s13, s23 -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s9, s24 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.32 r0, q0[0] -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.32 r0, q4[3] -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vmov.f32 s4, s17 +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vmov.f32 s21, s28 +; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s29, s12 +; CHECK-NEXT: vmov.f32 s9, s27 +; CHECK-NEXT: 
vmov.f32 s31, s25 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vmov.f32 s3, s14 +; CHECK-NEXT: vmov.f32 s2, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s28, s30 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s6, s26 -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vstrw.32 q7, [r1, #32] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.f32 s12, s25 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmov.f32 s5, s19 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1320,123 +1301,104 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q7, [r0, #128] -; CHECK-NEXT: vldrw.u32 q0, [r0, #144] -; CHECK-NEXT: vstrw.32 q2, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s4, s25 -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vmov.f32 s7, 
s26 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f32 s6, s22 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #128] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vldrw.u32 q3, [r0, #160] +; CHECK-NEXT: vstrw.32 q5, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vmov.f32 s17, s5 +; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vldrw.u32 q7, [r0, #144] +; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d4, d15 -; CHECK-NEXT: vldrw.u32 q4, [r0, #176] -; CHECK-NEXT: vmov.32 r0, q6[3] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s23 -; CHECK-NEXT: vmov.f32 s11, s31 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d8, d3 +; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s27 +; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s17, s23 +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d8, d12 +; 
CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f64 d2, d9 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s7, s19 +; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.32 r0, q7[0] -; CHECK-NEXT: vmov.f64 d2, d10 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q4[0] -; CHECK-NEXT: vmov.f32 s5, s24 -; CHECK-NEXT: vmov.f32 s7, s21 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.f32 s27, s6 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.f32 s23, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: 
vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.32 r0, q4[3] -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vstrw.32 q6, [r1, #112] -; CHECK-NEXT: vstrw.32 q2, [r1, #128] +; CHECK-NEXT: vmov.f64 d6, d15 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s19, s3 +; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s23, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.32 r0, q0[0] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q7[0] -; CHECK-NEXT: vmov.f32 s22, s2 +; CHECK-NEXT: vstrw.32 q5, [r1, #112] +; CHECK-NEXT: vmov.f32 s15, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s28 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s27, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #96] -; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, 
#144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f32 s28, s1 -; CHECK-NEXT: vmov.f32 s31, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s18 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vstrw.32 q4, [r1, #128] +; CHECK-NEXT: vmov.f32 s26, s0 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vstrw.32 q6, [r1, #96] +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s28, s5 ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [r1, #64] +; CHECK-NEXT: vmov.f32 s31, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s10 ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1575,7 +1537,7 @@ ; CHECK-NEXT: vmov.16 q0[0], r3 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.32 r0, q5[0] +; CHECK-NEXT: vmov r0, s20 ; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vdup.32 q4, r0 ; CHECK-NEXT: vmov r2, s9 @@ -1592,31 +1554,31 @@ ; CHECK-NEXT: vmovx.f16 s16, s18 ; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vmovx.f16 s16, s22 ; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s22 +; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vmovx.f16 s24, s7 ; CHECK-NEXT: 
vmov r2, s16 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.16 q4[1], r0 ; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: vmovx.f16 s24, s23 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.32 r2, q2[3] -; CHECK-NEXT: vmov.16 q4[7], r0 ; CHECK-NEXT: vdup.32 q7, r2 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov r2, s29 ; CHECK-NEXT: vmov.f32 s18, s23 ; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmovx.f16 s28, s30 ; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s28, s30 ; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vmov.f32 s1, s13 ; CHECK-NEXT: vmov.f32 s2, s14 @@ -1698,7 +1660,7 @@ ; CHECK-NEXT: vmov.f32 s5, s12 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r3, q0[0] +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill @@ -1710,22 +1672,24 @@ ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s0, s18 ; CHECK-NEXT: vmov.16 q3[5], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmovx.f16 s0, s10 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s19 ; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vldrw.u32 q5, 
[r0, #16] ; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov.32 r3, q5[3] +; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s6, s19 ; CHECK-NEXT: vmovx.f16 s0, s5 ; CHECK-NEXT: vmov r2, s0 @@ -1733,21 +1697,19 @@ ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.16 q2[3], r2 ; CHECK-NEXT: vmov.16 q7[0], r3 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] ; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q2[5], r2 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov.32 r0, q3[0] +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vmov.16 q7[1], r2 @@ -1769,23 +1731,23 @@ ; CHECK-NEXT: vmov.16 q1[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmovx.f16 s0, s14 ; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s15 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.32 r2, q5[3] ; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: 
vmovx.f16 s0, s9 ; CHECK-NEXT: vmov r0, s0