Index: llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -297,6 +297,8 @@
   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
 
+  bool tryInsertVectorElt(SDNode *N);
+
   // Select special operations if node forms integer ABS pattern
   bool tryABSOp(SDNode *N);
 
@@ -3022,6 +3024,107 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return false;
+
+  SDLoc dl(N);
+
+  // We are trying to use VMOV/VMOVX/VINS to more efficiently lower inserts and
+  // extracts of v8f16 and v8i16 vectors. Check that we have two adjacent
+  // inserts of the correct type:
+  SDValue Ins1 = SDValue(N, 0);
+  SDValue Ins2 = N->getOperand(0);
+  EVT VT = Ins1.getValueType();
+  if (Ins2.getOpcode() != ISD::INSERT_VECTOR_ELT || !Ins2.hasOneUse() ||
+      !isa<ConstantSDNode>(Ins1.getOperand(2)) ||
+      !isa<ConstantSDNode>(Ins2.getOperand(2)) ||
+      (VT != MVT::v8f16 && VT != MVT::v8i16) || (Ins2.getValueType() != VT))
+    return false;
+
+  unsigned Lane1 = Ins1.getConstantOperandVal(2);
+  unsigned Lane2 = Ins2.getConstantOperandVal(2);
+  if (Lane2 % 2 != 0 || Lane1 != Lane2 + 1)
+    return false;
+
+  // If the inserted values will be able to use T/B already, leave it to the
+  // existing tablegen patterns. For example VCVTT/VCVTB.
+  SDValue Val1 = Ins1.getOperand(1);
+  SDValue Val2 = Ins2.getOperand(1);
+  if (Val1.getOpcode() == ISD::FP_ROUND || Val2.getOpcode() == ISD::FP_ROUND)
+    return false;
+
+  // Check if the inserted values are both extracts.
+  if ((Val1.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val1.getOpcode() == ARMISD::VGETLANEu) &&
+      (Val2.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val2.getOpcode() == ARMISD::VGETLANEu) &&
+      isa<ConstantSDNode>(Val1.getOperand(1)) &&
+      isa<ConstantSDNode>(Val2.getOperand(1)) &&
+      (Val1.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val1.getOperand(0).getValueType() == MVT::v8i16) &&
+      (Val2.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val2.getOperand(0).getValueType() == MVT::v8i16)) {
+    unsigned ExtractLane1 = Val1.getConstantOperandVal(1);
+    unsigned ExtractLane2 = Val2.getConstantOperandVal(1);
+
+    // If the two extracted lanes are from the same place and adjacent, this
+    // simplifies into an f32 lane move.
+    if (Val1.getOperand(0) == Val2.getOperand(0) && ExtractLane2 % 2 == 0 &&
+        ExtractLane1 == ExtractLane2 + 1) {
+      SDValue NewExt = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue NewIns = CurDAG->getTargetInsertSubreg(
+          ARM::ssub_0 + Lane2 / 2, dl, VT, Ins2.getOperand(0),
+          NewExt);
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+
+    // Else v8i16 pattern of an extract and an insert, with an optional vmovx
+    // for extracting odd lanes.
+    if (VT == MVT::v8i16) {
+      SDValue Inp1 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue Inp2 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val2.getOperand(0));
+      if (ExtractLane1 % 2 != 0)
+        Inp1 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp1), 0);
+      if (ExtractLane2 % 2 != 0)
+        Inp2 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp2), 0);
+      SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Inp2, Inp1);
+      SDValue NewIns =
+          CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                        Ins2.getOperand(0), SDValue(VINS, 0));
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+  }
+
+  // The inserted values are not extracted - if they are f16 then insert them
+  // directly using a VINS.
+  if (VT == MVT::v8f16) {
+    auto F32RC = CurDAG->getTargetConstant(ARM::SPRRegClassID, dl, MVT::i32);
+    SDNode *Val1Copy = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                              dl, MVT::f32, Val1, F32RC);
+    SDNode *Val2Copy = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                              dl, MVT::f32, Val2, F32RC);
+    auto MQPRRC = CurDAG->getTargetConstant(ARM::MQPRRegClassID, dl, MVT::i32);
+    SDNode *VecCopy =
+        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MVT::v4f32,
+                               Ins2.getOperand(0), MQPRRC);
+
+    SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
+    SDValue NewIns =
+        CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                      Ins2.getOperand(0), SDValue(VINS, 0));
+    ReplaceUses(Ins1, NewIns);
+    return true;
+  }
+
+  return false;
+}
+
 bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
   if (!Subtarget->hasV6T2Ops())
     return false;
@@ -3443,6 +3546,11 @@
       return;
     }
   }
+  case ISD::INSERT_VECTOR_ELT: {
+    if (tryInsertVectorElt(N))
+      return;
+    break;
+  }
   case ISD::SRL:
     if (tryV6T2BitfieldExtractOp(N, false))
       return;
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1924,44 +1924,6 @@
              (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
   def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
             (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
-
-  foreach LANE = [0, 2, 4, 6] in {
-    defvar SSUB = !cast<SubRegIndex>("ssub_"#!srl(LANE, 1));
-
-    // v8f16 pattern for inserting two lanes using a VINS
-    def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV), (f16 HPR:$src1), LANE),
-                         (f16 HPR:$src2), !add(LANE,1)),
-              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
-                                   (VINSH (COPY_TO_REGCLASS HPR:$src1, SPR),
-                                          (COPY_TO_REGCLASS HPR:$src2, SPR)),
-                                   SSUB), MQPR)>;
-
-    // v8i16 pattern for extracting 2 even lane elements and inserting them using a VINS
-    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
-                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_even:$lane1),
-                                          LANE),
-                            (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
-                            !add(LANE,1)),
-              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
-                                   (VINSH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
-                                            (SSubReg_f16_reg imm_even:$lane1)),
-                                          (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
-                                            (SSubReg_f16_reg imm_even:$lane2))),
-                                   SSUB), MQPR)>;
-
-    // v8i16 pattern for extracting an element using VMOVX and inserting another using a VINS
-    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
-                                          (ARMvgetlaneu (v8i16 MQPR:$src1),
imm_odd:$lane1), - LANE), - (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2), - !add(LANE,1)), - (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)), - (VINSH (VMOVH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), - (SSubReg_f16_reg imm_odd:$lane1))), - (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)), - (SSubReg_f16_reg imm_even:$lane2))), - SSUB), MQPR)>; - } } // end of mve_bit instructions Index: llvm/lib/Target/ARM/ARMInstrVFP.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrVFP.td +++ llvm/lib/Target/ARM/ARMInstrVFP.td @@ -798,8 +798,6 @@ Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -// AddedComplexity to use over the dual-insert MVE pattern -let AddedComplexity = 6 in def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -372,8 +372,6 @@ ; CHECK-NEXT: bpl .LBB2_8 ; CHECK-NEXT: .LBB2_7: @ %cond.load12 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s24, s20 -; CHECK-NEXT: vins.f16 s20, s24 ; CHECK-NEXT: vldr.16 s24, [r0, #6] ; CHECK-NEXT: vins.f16 s21, s24 ; CHECK-NEXT: .LBB2_8: @ %else13 @@ -418,14 +416,10 @@ ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r0, #2] ; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s21 -; CHECK-NEXT: vins.f16 s21, s24 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_6 ; CHECK-NEXT: .LBB2_14: @ %cond.load9 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s24, s20 -; CHECK-NEXT: vins.f16 s20, s24 ; CHECK-NEXT: vmovx.f16 s24, s21 ; CHECK-NEXT: vldr.16 s21, [r0, #4] ; CHECK-NEXT: vins.f16 s21, s24 @@ -441,14 +435,10 @@ ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s28, [r1, #2] ; CHECK-NEXT: vins.f16 s24, s28 -; CHECK-NEXT: vmovx.f16 s28, s25 -; CHECK-NEXT: vins.f16 s25, s28 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_11 ; CHECK-NEXT: .LBB2_17: @ %cond.load22 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s28, s24 -; CHECK-NEXT: vins.f16 s24, s28 ; CHECK-NEXT: vmovx.f16 s28, s25 ; CHECK-NEXT: vldr.16 s25, [r1, #4] ; CHECK-NEXT: vins.f16 s25, s28 @@ -456,8 +446,6 @@ ; CHECK-NEXT: bpl.w .LBB2_2 ; CHECK-NEXT: .LBB2_18: @ %cond.load25 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s28, s24 -; CHECK-NEXT: vins.f16 s24, s28 ; CHECK-NEXT: vldr.16 s28, [r1, #6] ; CHECK-NEXT: vins.f16 s25, s28 ; CHECK-NEXT: b .LBB2_2 Index: llvm/test/CodeGen/Thumb2/mve-div-expand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -806,27 +806,27 @@ define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-LABEL: fdiv_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmovx.f16 s10, s0 -; CHECK-NEXT: vdiv.f16 s12, s10, s8 -; CHECK-NEXT: vdiv.f16 s8, s0, s4 -; CHECK-NEXT: vins.f16 s8, s12 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s14, s9 +; CHECK-NEXT: vdiv.f16 s12, s2, s0 +; 
CHECK-NEXT: vdiv.f16 s0, s8, s4 +; CHECK-NEXT: vins.f16 s0, s12 ; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmovx.f16 s14, s1 -; CHECK-NEXT: vdiv.f16 s9, s1, s5 ; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vmovx.f16 s14, s2 -; CHECK-NEXT: vins.f16 s9, s12 +; CHECK-NEXT: vdiv.f16 s1, s9, s5 +; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vmovx.f16 s14, s10 +; CHECK-NEXT: vdiv.f16 s2, s10, s6 ; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vdiv.f16 s10, s2, s6 -; CHECK-NEXT: vins.f16 s10, s12 +; CHECK-NEXT: vmovx.f16 s14, s11 +; CHECK-NEXT: vins.f16 s2, s12 ; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmovx.f16 s14, s3 -; CHECK-NEXT: vdiv.f16 s11, s3, s7 ; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vins.f16 s11, s12 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vdiv.f16 s3, s11, s7 +; CHECK-NEXT: vins.f16 s3, s12 ; CHECK-NEXT: bx lr entry: %out = fdiv <8 x half> %in1, %in2 Index: llvm/test/CodeGen/Thumb2/mve-float16regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1479,9 +1479,7 @@ ; CHECK-NEXT: vmov.u16 r4, q2[1] ; CHECK-NEXT: vfma.f16 q2, q6, r4 ; CHECK-NEXT: strh r4, [r5, #2] -; CHECK-NEXT: vmovx.f16 s6, s9 ; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vins.f16 s8, s6 ; CHECK-NEXT: strh r7, [r5], #4 ; CHECK-NEXT: vmov.16 q2[2], r3 ; CHECK-NEXT: le lr, .LBB17_5 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -98,19 +98,19 @@ ; CHECK-NEXT: vldr.16 s8, [r2] ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vldr.16 s8, [r2] +; CHECK-NEXT: vmov r3, s7 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldr.16 s4, [r3] +; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldr.16 s1, [r2] +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr.16 s8, [r0] ; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vldr.16 s8, [r1] ; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vins.f16 s2, s8 ; CHECK-NEXT: vldr.16 s8, [r0] Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -379,17 +379,17 @@ ; CHECK-NEXT: vldr.16 s8, [r1] ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vldr.16 s8, [r1] +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldr.16 s1, [r1] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vldr.16 s8, [r0] +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vldr.16 s8, [r1] ; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vins.f16 s2, s8 ; CHECK-NEXT: vldr.16 s8, [r0] Index: llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -851,8 +851,6 @@ ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB18_5 ; CHECK-LE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vins.f16 s0, s4 ; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] ; CHECK-LE-NEXT: vins.f16 s1, s4 ; CHECK-LE-NEXT: .LBB18_5: @ %else8 @@ -899,13 +897,9 @@ ; CHECK-LE-NEXT: .LBB18_7: @ %cond.load1 ; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] ; CHECK-LE-NEXT: vins.f16 s0, s4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 -; CHECK-LE-NEXT: vins.f16 s1, s4 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB18_3 ; CHECK-LE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vins.f16 s0, s4 ; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] ; CHECK-LE-NEXT: vins.f16 s1, s4 @@ -948,8 +942,6 @@ ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB18_5 ; CHECK-BE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vins.f16 s0, s4 ; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] ; CHECK-BE-NEXT: vins.f16 s1, s4 ; CHECK-BE-NEXT: .LBB18_5: @ %else8 @@ -996,13 +988,9 @@ ; CHECK-BE-NEXT: .LBB18_7: @ %cond.load1 ; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] ; CHECK-BE-NEXT: vins.f16 s0, s4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 -; CHECK-BE-NEXT: vins.f16 s1, s4 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB18_3 ; CHECK-BE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vins.f16 s0, s4 ; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] ; CHECK-BE-NEXT: vins.f16 s1, s4 @@ -1054,8 +1042,6 @@ ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB19_5 ; CHECK-LE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vins.f16 s0, s4 ; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] ; CHECK-LE-NEXT: vins.f16 s1, s4 ; CHECK-LE-NEXT: .LBB19_5: @ %else8 @@ -1102,13 +1088,9 @@ ; CHECK-LE-NEXT: .LBB19_7: @ %cond.load1 ; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] ; CHECK-LE-NEXT: vins.f16 s0, s4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 -; CHECK-LE-NEXT: vins.f16 s1, s4 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB19_3 ; CHECK-LE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vins.f16 s0, s4 ; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] ; CHECK-LE-NEXT: vins.f16 s1, s4 @@ -1151,8 +1133,6 @@ ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB19_5 ; CHECK-BE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vins.f16 s0, s4 ; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] ; CHECK-BE-NEXT: vins.f16 s1, s4 ; CHECK-BE-NEXT: .LBB19_5: @ %else8 @@ -1199,13 +1179,9 @@ ; CHECK-BE-NEXT: .LBB19_7: @ %cond.load1 ; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] ; CHECK-BE-NEXT: vins.f16 s0, s4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 -; CHECK-BE-NEXT: vins.f16 s1, s4 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB19_3 ; CHECK-BE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vins.f16 s0, s4 ; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] ; CHECK-BE-NEXT: vins.f16 s1, s4 Index: llvm/test/CodeGen/Thumb2/mve-minmax.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -315,27 +315,27 @@ define arm_aapcs_vfpcc <8 x 
half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: minnm_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s8, s0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 -; CHECK-MVE-NEXT: vminnm.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vminnm.f16 s8, s4, s0 -; CHECK-MVE-NEXT: vins.f16 s8, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmov q2, q0 +; CHECK-MVE-NEXT: vmovx.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s0, s8 ; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vminnm.f16 s9, s5, s1 +; CHECK-MVE-NEXT: vminnm.f16 s12, s2, s0 +; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 ; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vins.f16 s9, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s10 ; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vminnm.f16 s10, s6, s2 -; CHECK-MVE-NEXT: vins.f16 s10, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 ; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vminnm.f16 s11, s7, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s11 ; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s12 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s12 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: minnm_float16_t: Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -201,15 +201,15 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle1_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vins.f16 s4, s3 -; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s5, s2 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -228,16 +228,15 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s5, s3 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vins.f16 s5, s3 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s1, s7 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -325,31 +324,30 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) { ; CHECK-LABEL: shuffle2step_i16: ; CHECK: @ %bb.0: @ %entry -; 
CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vins.f16 s8, s1 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vins.f16 s9, s3 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vins.f16 s10, s5 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q2, q3 +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmov.f32 s13, s2 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s11, s6 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmov.f32 s15, s6 +; CHECK-NEXT: vadd.i16 q0, q3, q2 +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> @@ -361,59 +359,51 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle3step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s8 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vins.f16 s12, s16 +; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vins.f16 s13, s16 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s18, s8 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; 
CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmovx.f16 s23, s9 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s9, s3 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmovnb.i32 q0, q2 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s23 -; CHECK-NEXT: vadd.i16 q0, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmovnb.i32 q5, q3 +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmovx.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vins.f16 s16, s20 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vmov.f32 s17, s3 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s20, s10 +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s21, s3 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s3, s9 +; CHECK-NEXT: vins.f16 s21, s5 +; CHECK-NEXT: vins.f16 s3, s11 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovnb.i32 q1, q5 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vadd.i16 q0, q4, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -427,64 +417,53 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle4step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmovx.f16 s18, s9 +; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s20, s3 +; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s16, s20 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vmovx.f16 s17, s5 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s17, s20 ; CHECK-NEXT: vmov.f32 s22, s9 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vins.f16 s22, s11 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vins.f16 s20, s3 -; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmovx.f16 s24, s10 ; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vins.f16 s18, s10 -; CHECK-NEXT: vmov.f32 s26, s30 -; CHECK-NEXT: vins.f16 s21, s7 -; CHECK-NEXT: vmov.f32 s27, s31 -; CHECK-NEXT: vmov.f32 s19, s12 -; CHECK-NEXT: vadd.i16 q5, q5, q6 -; 
CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vins.f16 s19, s14 -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vins.f16 s17, s6 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vadd.i16 q0, q4, q6 +; CHECK-NEXT: vadd.i16 q4, q5, q4 +; CHECK-NEXT: vmovx.f16 s22, s8 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vmovx.f16 s23, s12 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmov.f32 s10, s8 +; CHECK-NEXT: vmov.f32 s11, s12 +; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s21, s4 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vadd.i16 q0, q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -1363,30 +1342,26 @@ define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) { ; CHECKFP-LABEL: shuffle2step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8} -; CHECKFP-NEXT: vpush {d8} -; CHECKFP-NEXT: vmovx.f16 s16, s1 -; CHECKFP-NEXT: vmovx.f16 s12, s0 -; CHECKFP-NEXT: vmov.f32 s8, s0 -; CHECKFP-NEXT: vins.f16 s12, s16 -; CHECKFP-NEXT: vins.f16 s8, s1 -; CHECKFP-NEXT: vmovx.f16 s13, s2 -; CHECKFP-NEXT: vmovx.f16 s16, s3 -; CHECKFP-NEXT: vmov.f32 s9, s2 -; CHECKFP-NEXT: vins.f16 s13, s16 -; CHECKFP-NEXT: vins.f16 s9, s3 -; CHECKFP-NEXT: vmovx.f16 s0, s5 -; CHECKFP-NEXT: vmovx.f16 s14, s4 -; CHECKFP-NEXT: vmov.f32 s10, s4 -; CHECKFP-NEXT: vins.f16 s14, s0 -; CHECKFP-NEXT: vins.f16 s10, s5 -; CHECKFP-NEXT: vmovx.f16 s0, s7 -; CHECKFP-NEXT: vmovx.f16 s15, s6 +; CHECKFP-NEXT: vmovx.f16 s12, s1 +; CHECKFP-NEXT: vmovx.f16 s8, s0 +; CHECKFP-NEXT: vins.f16 s8, s12 +; CHECKFP-NEXT: vmovx.f16 s12, s3 +; CHECKFP-NEXT: vmovx.f16 s9, s2 +; CHECKFP-NEXT: vins.f16 s0, s1 +; CHECKFP-NEXT: vins.f16 s9, s12 +; CHECKFP-NEXT: vins.f16 s2, s3 +; CHECKFP-NEXT: vmovx.f16 s12, s5 +; CHECKFP-NEXT: vmovx.f16 s10, s4 +; CHECKFP-NEXT: vins.f16 s10, s12 +; CHECKFP-NEXT: vins.f16 s4, s5 +; CHECKFP-NEXT: vmov.f32 s1, s2 +; CHECKFP-NEXT: vmovx.f16 s12, s7 +; CHECKFP-NEXT: vmovx.f16 s11, s6 ; CHECKFP-NEXT: vins.f16 s6, s7 -; CHECKFP-NEXT: vins.f16 s15, s0 -; CHECKFP-NEXT: vmov.f32 s11, s6 -; CHECKFP-NEXT: vadd.f16 q0, q2, q3 -; CHECKFP-NEXT: vpop {d8} +; CHECKFP-NEXT: vmov.f32 s2, s4 +; CHECKFP-NEXT: vins.f16 s11, s12 +; CHECKFP-NEXT: vmov.f32 s3, s6 +; CHECKFP-NEXT: vadd.f16 q0, q0, q2 ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> @@ -1398,8 +1373,8 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) { ; CHECKFP-LABEL: shuffle3step_f16: ; 
CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECKFP-NEXT: vmovx.f16 s16, s2 ; CHECKFP-NEXT: vmov.f32 s12, s1 ; CHECKFP-NEXT: vins.f16 s12, s16 @@ -1411,32 +1386,32 @@ ; CHECKFP-NEXT: vins.f16 s19, s20 ; CHECKFP-NEXT: vmov.f32 s14, s7 ; CHECKFP-NEXT: vmovx.f16 s20, s8 -; CHECKFP-NEXT: vmovx.f16 s24, s1 +; CHECKFP-NEXT: vmov.f32 s28, s6 ; CHECKFP-NEXT: vins.f16 s14, s20 -; CHECKFP-NEXT: vmov.f32 s20, s0 -; CHECKFP-NEXT: vins.f16 s20, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s4 -; CHECKFP-NEXT: vmov.f32 s21, s3 -; CHECKFP-NEXT: vins.f16 s21, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s7 -; CHECKFP-NEXT: vmov.f32 s22, s6 -; CHECKFP-NEXT: vins.f16 s22, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s0 -; CHECKFP-NEXT: vins.f16 s24, s2 +; CHECKFP-NEXT: vmovx.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s28, s20 +; CHECKFP-NEXT: vmovx.f16 s24, s1 +; CHECKFP-NEXT: vmovx.f16 s20, s0 +; CHECKFP-NEXT: vins.f16 s0, s24 +; CHECKFP-NEXT: vins.f16 s20, s2 +; CHECKFP-NEXT: vmovx.f16 s26, s4 +; CHECKFP-NEXT: vmovx.f16 s21, s3 +; CHECKFP-NEXT: vins.f16 s3, s26 +; CHECKFP-NEXT: vins.f16 s21, s5 +; CHECKFP-NEXT: vmovx.f16 s30, s10 +; CHECKFP-NEXT: vmovx.f16 s23, s9 ; CHECKFP-NEXT: vmov.f32 s18, s8 -; CHECKFP-NEXT: vmovx.f16 s25, s3 -; CHECKFP-NEXT: vmovx.f16 s0, s10 -; CHECKFP-NEXT: vins.f16 s25, s5 +; CHECKFP-NEXT: vmov.f32 s1, s3 +; CHECKFP-NEXT: vins.f16 s9, s30 +; CHECKFP-NEXT: vins.f16 s23, s11 +; CHECKFP-NEXT: vmov.f32 s2, s28 +; CHECKFP-NEXT: vmovx.f16 s22, s6 +; CHECKFP-NEXT: vmov.f32 s3, s9 +; CHECKFP-NEXT: vins.f16 s22, s8 ; CHECKFP-NEXT: vmov.f32 s15, s19 -; CHECKFP-NEXT: vmovx.f16 s27, s9 -; CHECKFP-NEXT: vins.f16 s9, s0 -; CHECKFP-NEXT: vins.f16 s27, s11 -; CHECKFP-NEXT: vmov.f32 s23, s9 -; CHECKFP-NEXT: vmovx.f16 s26, s6 -; CHECKFP-NEXT: vins.f16 s26, s8 -; CHECKFP-NEXT: vadd.f16 q0, q5, q6 +; CHECKFP-NEXT: vadd.f16 q0, q0, q5 ; CHECKFP-NEXT: vadd.f16 q0, q0, q3 -; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> @@ -1450,8 +1425,8 @@ define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) { ; CHECKFP-LABEL: shuffle4step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: vmovx.f16 s20, s11 ; CHECKFP-NEXT: vmovx.f16 s18, s9 ; CHECKFP-NEXT: vins.f16 s18, s20 @@ -1461,42 +1436,40 @@ ; CHECKFP-NEXT: vins.f16 s19, s20 ; CHECKFP-NEXT: vmovx.f16 s20, s3 ; CHECKFP-NEXT: vmovx.f16 s16, s1 -; CHECKFP-NEXT: vmovx.f16 s28, s10 +; CHECKFP-NEXT: vmovx.f16 s24, s10 ; CHECKFP-NEXT: vins.f16 s16, s20 -; CHECKFP-NEXT: vmovx.f16 s26, s8 ; CHECKFP-NEXT: vmovx.f16 s20, s7 ; CHECKFP-NEXT: vmovx.f16 s17, s5 -; CHECKFP-NEXT: vins.f16 s17, s20 -; CHECKFP-NEXT: vmov.f32 s22, s9 -; CHECKFP-NEXT: vins.f16 s8, s10 ; CHECKFP-NEXT: vins.f16 s13, s15 -; CHECKFP-NEXT: vins.f16 s26, s28 -; CHECKFP-NEXT: vmov.f32 s23, s13 -; CHECKFP-NEXT: vmovx.f16 s28, s14 -; CHECKFP-NEXT: vmovx.f16 s27, s12 -; CHECKFP-NEXT: vmov.f32 s10, s8 -; CHECKFP-NEXT: vins.f16 s12, s14 -; CHECKFP-NEXT: vmov.f32 s11, s12 -; CHECKFP-NEXT: vins.f16 s27, s28 +; CHECKFP-NEXT: vins.f16 s17, s20 +; 
CHECKFP-NEXT: vmovx.f16 s22, s8 +; CHECKFP-NEXT: vins.f16 s22, s24 +; CHECKFP-NEXT: vmovx.f16 s24, s14 +; CHECKFP-NEXT: vmovx.f16 s23, s12 ; CHECKFP-NEXT: vins.f16 s1, s3 -; CHECKFP-NEXT: vmovx.f16 s28, s2 -; CHECKFP-NEXT: vmovx.f16 s24, s0 -; CHECKFP-NEXT: vmov.f32 s20, s1 +; CHECKFP-NEXT: vins.f16 s23, s24 +; CHECKFP-NEXT: vmovx.f16 s24, s2 +; CHECKFP-NEXT: vmovx.f16 s20, s0 ; CHECKFP-NEXT: vins.f16 s5, s7 -; CHECKFP-NEXT: vins.f16 s24, s28 -; CHECKFP-NEXT: vmov.f32 s21, s5 -; CHECKFP-NEXT: vmovx.f16 s28, s6 -; CHECKFP-NEXT: vmovx.f16 s25, s4 +; CHECKFP-NEXT: vins.f16 s20, s24 +; CHECKFP-NEXT: vmovx.f16 s24, s6 +; CHECKFP-NEXT: vmovx.f16 s21, s4 +; CHECKFP-NEXT: vins.f16 s8, s10 +; CHECKFP-NEXT: vins.f16 s21, s24 +; CHECKFP-NEXT: vmov.f32 s26, s9 +; CHECKFP-NEXT: vins.f16 s12, s14 ; CHECKFP-NEXT: vins.f16 s0, s2 +; CHECKFP-NEXT: vmov.f32 s27, s13 ; CHECKFP-NEXT: vins.f16 s4, s6 -; CHECKFP-NEXT: vins.f16 s25, s28 +; CHECKFP-NEXT: vmov.f32 s24, s1 +; CHECKFP-NEXT: vmov.f32 s2, s8 +; CHECKFP-NEXT: vmov.f32 s3, s12 ; CHECKFP-NEXT: vmov.f32 s1, s4 -; CHECKFP-NEXT: vadd.f16 q1, q5, q4 -; CHECKFP-NEXT: vmov.f32 s2, s10 -; CHECKFP-NEXT: vmov.f32 s3, s11 -; CHECKFP-NEXT: vadd.f16 q0, q0, q6 -; CHECKFP-NEXT: vadd.f16 q0, q0, q1 -; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECKFP-NEXT: vmov.f32 s25, s5 +; CHECKFP-NEXT: vadd.f16 q0, q0, q5 +; CHECKFP-NEXT: vadd.f16 q4, q6, q4 +; CHECKFP-NEXT: vadd.f16 q0, q0, q4 +; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-shufflemov.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -35,15 +35,15 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_76543210: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vins.f16 s4, s3 -; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s5, s2 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-simple-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -79,27 +79,27 @@ define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: add_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s8, s0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 -; CHECK-MVE-NEXT: vadd.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vadd.f16 s8, s4, s0 -; CHECK-MVE-NEXT: vins.f16 s8, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmov q2, q0 +; CHECK-MVE-NEXT: vmovx.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s0, s8 ; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vadd.f16 s9, s5, s1 +; CHECK-MVE-NEXT: vadd.f16 s12, s2, s0 +; CHECK-MVE-NEXT: vadd.f16 s0, s4, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vmovx.f16 
s12, s9 ; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vadd.f16 s1, s5, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vins.f16 s9, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vadd.f16 s2, s6, s10 ; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vadd.f16 s10, s6, s2 -; CHECK-MVE-NEXT: vins.f16 s10, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 ; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vadd.f16 s11, s7, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s11 ; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s12 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vadd.f16 s3, s7, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s12 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: add_float16_t: @@ -216,27 +216,27 @@ define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: sub_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s8, s0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 -; CHECK-MVE-NEXT: vsub.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vsub.f16 s8, s4, s0 -; CHECK-MVE-NEXT: vins.f16 s8, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmov q2, q0 +; CHECK-MVE-NEXT: vmovx.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s0, s8 ; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vsub.f16 s9, s5, s1 +; CHECK-MVE-NEXT: vsub.f16 s12, s2, s0 +; CHECK-MVE-NEXT: vsub.f16 s0, s4, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 ; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vsub.f16 s1, s5, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vins.f16 s9, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vsub.f16 s2, s6, s10 ; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vsub.f16 s10, s6, s2 -; CHECK-MVE-NEXT: vins.f16 s10, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 ; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vsub.f16 s11, s7, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s11 ; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s12 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vsub.f16 s3, s7, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s12 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: sub_float16_t: @@ -336,27 +336,27 @@ define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: mul_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s8, s0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 -; CHECK-MVE-NEXT: vmul.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vmul.f16 s8, s4, s0 -; CHECK-MVE-NEXT: vins.f16 s8, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmov q2, q0 +; CHECK-MVE-NEXT: vmovx.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s0, s8 ; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vmul.f16 s9, s5, s1 +; CHECK-MVE-NEXT: vmul.f16 s12, s2, s0 +; CHECK-MVE-NEXT: vmul.f16 s0, s4, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 ; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmul.f16 s1, s5, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vins.f16 s9, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmul.f16 s2, s6, s10 ; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmul.f16 s10, 
s6, s2 -; CHECK-MVE-NEXT: vins.f16 s10, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 ; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vmul.f16 s11, s7, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s11 ; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s12 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vmul.f16 s3, s7, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s12 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: mul_float16_t: Index: llvm/test/CodeGen/Thumb2/mve-vcvt.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -92,32 +92,32 @@ ; CHECK-MVE-LABEL: foo_half_int16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmov.s16 r0, q0[1] +; CHECK-MVE-NEXT: vmov.s16 r0, q0[0] ; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[0] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s0 -; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[1] +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] +; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s2 +; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0 ; CHECK-MVE-NEXT: vins.f16 s0, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[2] ; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 ; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[5] +; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] ; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s10 ; CHECK-MVE-NEXT: vins.f16 s1, s8 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[5] +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s8 ; CHECK-MVE-NEXT: vmov s10, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[7] -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s10 -; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vcvt.f16.s32 s10, s10 ; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vins.f16 s2, s10 +; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 ; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr @@ -135,32 +135,32 @@ ; CHECK-MVE-LABEL: foo_half_uint16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmov.u16 r0, q0[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q0[0] ; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s0 -; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] +; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s2 +; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0 ; CHECK-MVE-NEXT: vins.f16 s0, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] ; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 ; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] +; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] ; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s10 ; CHECK-MVE-NEXT: vins.f16 s1, s8 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s8 ; CHECK-MVE-NEXT: vmov s10, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s10 -; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: 
vcvt.f16.u32 s10, s10 ; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vins.f16 s2, s10 +; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 ; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vld2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -205,33 +205,28 @@ define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q2, [r0] -; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d4 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vins.f16 s4, s9 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.f32 s5, s10 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vins.f16 s5, s11 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vins.f16 s6, s1 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vins.f16 s7, s3 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q1, q3 +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s5, s2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmovx.f16 s6, s8 +; CHECK-NEXT: vins.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vmovx.f16 s7, s10 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s7, s12 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -571,15 +566,15 @@ ; CHECK-LABEL: vld2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vmovx.f16 s8, s3 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s2, s3 ; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 @@ -635,33 +630,29 @@ define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrb.u8 q3, [r0] -; CHECK-NEXT: vldrb.u8 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d6 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vmovx.f16 s16, s15 -; CHECK-NEXT: vmovx.f16 s9, s14 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vins.f16 s9, s16 -; CHECK-NEXT: vmov.f32 s1, s14 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vins.f16 s1, s15 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmov.f32 
s2, s4 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmovx.f16 s11, s6 -; CHECK-NEXT: vins.f16 s2, s5 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s11, s12 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vadd.f16 q0, q0, q2 +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s5, s2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmovx.f16 s6, s8 +; CHECK-NEXT: vins.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vmovx.f16 s7, s10 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vins.f16 s7, s12 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x half>, <16 x half>* %src, align 1 Index: llvm/test/CodeGen/Thumb2/mve-vld3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -288,63 +288,55 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.f32 s22, s4 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmovnb.i32 q6, q0 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.f32 s3, s23 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vins.f16 s20, s14 -; CHECK-NEXT: vmovx.f16 s21, s15 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmovnb.i32 q5, q0 +; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vmovx.f16 s20, s5 +; CHECK-NEXT: vmov.f32 s3, s19 +; 
CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vins.f16 s16, s20 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vins.f16 s20, s6 +; CHECK-NEXT: vmovx.f16 s21, s7 +; CHECK-NEXT: vins.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s7, s13 ; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s10, s4 -; CHECK-NEXT: vmovx.f16 s11, s5 +; CHECK-NEXT: vins.f16 s7, s15 ; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s23, s11 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s23, s7 ; CHECK-NEXT: vadd.i16 q1, q4, q5 ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i16>, <24 x i16>* %src, align 4 @@ -364,112 +356,96 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s12 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s23, s13 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s13, s11 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s13, s5 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmovnb.i32 q1, q3 -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: 
vadd.i16 q0, q0, q4 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s26, s20 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmovnb.i32 q5, q0 +; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vmovx.f16 s20, s5 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vins.f16 s16, s20 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vins.f16 s20, s6 +; CHECK-NEXT: vmovx.f16 s21, s7 +; CHECK-NEXT: vins.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s7, s13 +; CHECK-NEXT: vins.f16 s21, s9 +; CHECK-NEXT: vins.f16 s7, s15 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s23, s7 +; CHECK-NEXT: vadd.i16 q1, q4, q5 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vins.f16 s4, s12 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmovx.f16 s16, s13 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmovx.f16 s20, s19 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vins.f16 s27, s20 +; CHECK-NEXT: vmov.f64 d10, d4 +; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vmov.f32 s26, s16 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vins.f16 s21, s0 ; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] ; CHECK-NEXT: vmovnb.i32 q7, q1 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmov.f32 s22, s14 +; CHECK-NEXT: vins.f16 s22, s0 ; CHECK-NEXT: vmov.f32 s6, s30 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.f32 s7, s27 -; CHECK-NEXT: vins.f16 s26, s20 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmovx.f16 s27, s21 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vins.f16 s27, s23 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vins.f16 s20, s14 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmovx.f16 s21, s15 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vmovnb.i32 q0, q5 -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: 
vmov.f32 s23, s27 -; CHECK-NEXT: vadd.i16 q0, q4, q5 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vins.f16 s24, s10 +; CHECK-NEXT: vins.f16 s23, s0 +; CHECK-NEXT: vins.f16 s2, s16 +; CHECK-NEXT: vmovx.f16 s25, s11 +; CHECK-NEXT: vmovx.f16 s3, s17 +; CHECK-NEXT: vins.f16 s25, s13 +; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmovnb.i32 q2, q6 +; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vadd.i16 q0, q5, q6 ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -573,56 +549,59 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { ; CHECK-LABEL: vld3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrb.u16 q0, [r0, #16] -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vins.f16 s10, s4 ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmovx.f16 s11, s5 ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmovx.f16 s11, s1 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vins.f16 s11, s3 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vins.f16 s18, s16 +; CHECK-NEXT: vins.f16 s11, s7 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vadd.i16 q2, q3, q2 ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q0[11] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vins.f16 s2, s0 ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: 
vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.f32 s15, s2 ; CHECK-NEXT: vadd.i16 q0, q2, q3 ; CHECK-NEXT: vstrb.16 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i8>, <24 x i8>* %src, align 4 @@ -1128,13 +1107,13 @@ ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s4, s2 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vadd.f16 q1, q0, q1 -; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vadd.f16 q1, q0, q2 ; CHECK-NEXT: vmov.f32 s0, s1 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, s0 @@ -1156,28 +1135,26 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s4, s2 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: ldrd r2, r3, [r0, #16] ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmovx.f16 s5, s3 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vins.f16 s14, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vmov.f32 s17, s14 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vins.f16 s8, s12 -; CHECK-NEXT: vadd.f16 q1, q4, q1 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vins.f16 s12, s6 +; CHECK-NEXT: vmovx.f16 s18, s5 +; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vins.f16 s4, s18 +; CHECK-NEXT: vmovx.f16 s13, s7 +; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s13, s9 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 @@ -1198,49 +1175,49 @@ define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vmovx.f16 s24, s1 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmovx.f16 s26, s16 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vins.f16 s15, s20 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vins.f16 s6, s20 ; 
CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmov.f32 s11, s18 -; CHECK-NEXT: vins.f16 s11, s20 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmovx.f16 s20, s16 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmov.f64 d10, d6 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s4 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vins.f16 s24, s14 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmovx.f16 s25, s15 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s25, s5 -; CHECK-NEXT: vmovx.f16 s27, s17 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vins.f16 s27, s19 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vmovx.f16 s26, s6 -; CHECK-NEXT: vins.f16 s26, s16 -; CHECK-NEXT: vadd.f16 q1, q5, q6 -; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s28, s18 +; CHECK-NEXT: vmovx.f16 s30, s10 +; CHECK-NEXT: vins.f16 s28, s20 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s0, s24 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s21, s3 +; CHECK-NEXT: vins.f16 s3, s26 +; CHECK-NEXT: vins.f16 s21, s17 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmovx.f16 s23, s9 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s9, s30 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmovx.f16 s22, s18 +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 @@ -1256,89 +1233,89 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmov.f32 s11, s18 -; CHECK-NEXT: vins.f16 s11, s20 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmovx.f16 s20, s16 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmov.f64 d10, d6 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s4 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vins.f16 s24, s14 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmovx.f16 s25, s15 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s25, s5 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmovx.f16 s27, s17 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vins.f16 s27, s19 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vmovx.f16 s26, s6 -; CHECK-NEXT: vins.f16 s26, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: 
vadd.f16 q1, q5, q6 -; CHECK-NEXT: vadd.f16 q0, q1, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vmov.f32 s0, s17 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vmov.f32 s15, s6 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vmovx.f16 s24, s1 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmovx.f16 s26, s16 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmov.f32 s15, s10 ; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.f32 s21, s19 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s24, s16 -; CHECK-NEXT: vins.f16 s24, s18 -; CHECK-NEXT: vmov.f32 s3, s15 -; CHECK-NEXT: vmovx.f16 s25, s19 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s25, s9 -; CHECK-NEXT: vmovx.f16 s27, s5 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vins.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s20, s19 +; CHECK-NEXT: vmov.f32 s28, s18 +; CHECK-NEXT: vmovx.f16 s30, s10 +; CHECK-NEXT: vins.f16 s28, s20 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s0, s24 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s21, s3 +; CHECK-NEXT: vins.f16 s3, s26 +; CHECK-NEXT: vins.f16 s21, s17 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmovx.f16 s23, s9 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s9, s30 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmovx.f16 s22, s18 +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vadd.f16 q1, q0, q1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vins.f16 s4, s16 +; CHECK-NEXT: vmovx.f16 s16, s13 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmovx.f16 s24, s1 ; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vins.f16 s27, s7 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmovx.f16 s26, s10 -; CHECK-NEXT: vins.f16 s26, s4 -; CHECK-NEXT: vadd.f16 q1, q5, q6 -; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.f32 s28, s14 +; CHECK-NEXT: vins.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vins.f16 s28, s20 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s0, s24 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s21, s3 +; CHECK-NEXT: vmovx.f16 s26, s12 +; CHECK-NEXT: vins.f16 s21, s13 +; CHECK-NEXT: vins.f16 
s3, s26 +; CHECK-NEXT: vmovx.f16 s30, s10 +; CHECK-NEXT: vmovx.f16 s23, s9 +; CHECK-NEXT: vmov.f32 s18, s8 +; CHECK-NEXT: vins.f16 s9, s30 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmovx.f16 s22, s14 +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s7, s19 +; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 Index: llvm/test/CodeGen/Thumb2/mve-vld4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -388,69 +388,58 @@ define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrb.u8 q3, [r0] -; CHECK-NEXT: vldrb.u8 q0, [r0, #32] -; CHECK-NEXT: vldrb.u8 q1, [r0, #48] -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.f32 s18, s1 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vins.f16 s18, s3 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vins.f16 s19, s7 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vins.f16 s16, s15 -; CHECK-NEXT: vmov.f32 s17, s9 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vins.f16 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vadd.i16 q4, q4, q5 -; CHECK-NEXT: vmov.f64 d11, d0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vins.f16 s22, s2 -; CHECK-NEXT: vmov.f32 s23, s4 -; CHECK-NEXT: vins.f16 s23, s6 -; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: vins.f16 s20, s14 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vins.f16 s21, s10 -; CHECK-NEXT: vmov.f32 s26, s14 -; CHECK-NEXT: vmov.f32 s27, s15 -; CHECK-NEXT: vadd.i16 q0, q5, q6 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q2, [r0, #48] +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s18, s5 +; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vins.f16 s19, s0 +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 
s12, s3 +; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vins.f16 s16, s12 +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmov.f32 s22, s5 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vmov.f32 s20, s1 +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vadd.i16 q4, q5, q4 +; CHECK-NEXT: vmovx.f16 s22, s4 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vmovx.f16 s23, s8 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmov.f32 s6, s4 +; CHECK-NEXT: vmov.f32 s7, s8 +; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vmovx.f16 s21, s12 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 1 @@ -1082,13 +1071,13 @@ ; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vadd.f16 q1, q1, q2 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vins.f16 s12, s4 ; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vadd.f16 q1, q1, q2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] @@ -1109,36 +1098,36 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vins.f16 s8, s4 ; CHECK-NEXT: vldrh.u16 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vins.f16 s9, s16 -; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vadd.f16 q2, q3, q2 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s9, s5 +; CHECK-NEXT: vins.f16 s9, s12 ; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vmovx.f16 s13, s4 -; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s13, s16 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s17, s5 ; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vadd.f16 q2, q4, q2 ; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x half>, <16 x half>* %src, align 2 @@ -1233,58 +1222,56 @@ define void 
@vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrb.u8 q2, [r0, #32] -; CHECK-NEXT: vldrb.u8 q3, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrb.u8 q0, [r0, #32] +; CHECK-NEXT: vldrb.u8 q2, [r0, #48] +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vins.f16 s18, s4 ; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vmovx.f16 s2, s9 -; CHECK-NEXT: vins.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s4, s15 -; CHECK-NEXT: vmovx.f16 s3, s13 -; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s19, s4 ; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vmovx.f16 s28, s10 -; CHECK-NEXT: vmovx.f16 s26, s8 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s0, s16 -; CHECK-NEXT: vldrb.u8 q4, [r0, #16] +; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vins.f16 s16, s12 +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vmovx.f16 s23, s8 ; CHECK-NEXT: vins.f16 s13, s15 -; CHECK-NEXT: vins.f16 s26, s28 -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vmovx.f16 s1, s17 -; CHECK-NEXT: vins.f16 s1, s20 -; CHECK-NEXT: vmov.f32 s22, s9 +; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vmovx.f16 s21, s12 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmovx.f16 s28, s14 -; CHECK-NEXT: vmovx.f16 s27, s12 -; CHECK-NEXT: vmov.f32 s10, s8 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmov.f32 s11, s12 -; CHECK-NEXT: vins.f16 s27, s28 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s28, s6 -; CHECK-NEXT: vmovx.f16 s24, s4 -; CHECK-NEXT: vmov.f32 s20, s5 -; CHECK-NEXT: vins.f16 s17, s19 -; CHECK-NEXT: vins.f16 s24, s28 -; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vmovx.f16 s28, s18 -; CHECK-NEXT: vmovx.f16 s25, s16 +; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmov.f32 s26, s1 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vins.f16 s25, s28 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vadd.f16 q1, q1, q6 -; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s24, s5 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s8 +; CHECK-NEXT: vmov.f32 s25, s13 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vadd.f16 q4, q6, q4 +; CHECK-NEXT: vadd.f16 q0, q1, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 1 Index: llvm/test/CodeGen/Thumb2/mve-vldst4.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -23,104 +23,104 @@ ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q3, [r0, #32] +; CHECK-NEXT: vldrh.u16 q4, [r0, #32] ; CHECK-NEXT: vldrh.u16 q5, [r0, #48] -; CHECK-NEXT: vldrh.u16 q4, [r0], #64 -; CHECK-NEXT: vmovx.f16 s4, s15 -; CHECK-NEXT: vmovx.f16 s2, s13 -; CHECK-NEXT: vins.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmovx.f16 s3, s21 -; CHECK-NEXT: vldrh.u16 q7, [r0, #-48] -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmovx.f16 s4, s19 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vins.f16 s13, s15 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s4, s31 -; CHECK-NEXT: vmovx.f16 s1, s29 -; CHECK-NEXT: vins.f16 s21, s23 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vins.f16 s17, s19 -; CHECK-NEXT: vmul.f16 q1, q0, r2 -; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vldrh.u16 q3, [r0], #64 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] +; CHECK-NEXT: vins.f16 s2, s19 ; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vins.f16 s29, s31 -; CHECK-NEXT: vmov.f32 s0, s17 -; CHECK-NEXT: vmov.f32 s1, s29 -; CHECK-NEXT: vmul.f16 q2, q0, r2 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s27, s8 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmov.f32 s25, s8 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vins.f16 s27, s0 -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s24, s14 -; CHECK-NEXT: vins.f16 s2, s24 -; CHECK-NEXT: vmovx.f16 s24, s22 -; CHECK-NEXT: vmovx.f16 s3, s20 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s3, s24 -; CHECK-NEXT: vmovx.f16 s24, s18 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vins.f16 s20, s22 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vmovx.f16 s24, s30 -; CHECK-NEXT: vmovx.f16 s1, s28 +; CHECK-NEXT: vmovx.f16 s5, s25 +; CHECK-NEXT: vins.f16 s3, s23 +; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmovx.f16 s8, s23 +; CHECK-NEXT: vmovx.f16 s7, s21 +; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s4, s13 +; CHECK-NEXT: vins.f16 s25, s27 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vmul.f16 q2, q1, r2 +; CHECK-NEXT: vmul.f16 q0, q0, r2 +; CHECK-NEXT: vmovx.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s28, s8 +; CHECK-NEXT: vins.f16 s7, s28 +; CHECK-NEXT: vmovx.f16 s30, s16 +; CHECK-NEXT: vmovx.f16 s31, s20 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s28, s12 ; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vins.f16 s1, s24 -; CHECK-NEXT: vins.f16 s28, s30 -; CHECK-NEXT: vmul.f16 q6, q0, r2 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s20 -; CHECK-NEXT: vmov.f32 s17, s28 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vmovx.f16 s2, s24 -; CHECK-NEXT: vmul.f16 q5, q4, r2 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: 
vmov q4, q2 -; CHECK-NEXT: vmov.f32 s19, s0 -; CHECK-NEXT: vmovx.f16 s31, s10 -; CHECK-NEXT: vmovx.f16 s18, s21 -; CHECK-NEXT: vins.f16 s21, s25 -; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmov.f32 s16, s21 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.f32 s29, s10 -; CHECK-NEXT: vins.f16 s18, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s31, s0 -; CHECK-NEXT: vmovx.f16 s0, s26 -; CHECK-NEXT: vmovx.f16 s30, s22 -; CHECK-NEXT: vins.f16 s22, s26 -; CHECK-NEXT: vmov.f32 s28, s22 -; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vmovx.f16 s3, s11 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vstrh.16 q7, [r1, #32] -; CHECK-NEXT: vmov.f32 s1, s11 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmovx.f16 s4, s27 -; CHECK-NEXT: vmovx.f16 s2, s23 -; CHECK-NEXT: vins.f16 s23, s27 -; CHECK-NEXT: vmov.f32 s0, s23 -; CHECK-NEXT: vins.f16 s2, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrh.16 q0, [r1, #48] -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vstrh.16 q3, [r1], #64 +; CHECK-NEXT: vmovx.f16 s29, s24 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vins.f16 s20, s22 +; CHECK-NEXT: vins.f16 s30, s4 +; CHECK-NEXT: vmovx.f16 s4, s22 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s31, s4 +; CHECK-NEXT: vmovx.f16 s4, s14 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vins.f16 s24, s26 +; CHECK-NEXT: vmov.f32 s15, s20 +; CHECK-NEXT: vins.f16 s28, s4 +; CHECK-NEXT: vmovx.f16 s4, s26 +; CHECK-NEXT: vmov.f32 s13, s24 +; CHECK-NEXT: vins.f16 s29, s4 +; CHECK-NEXT: vmul.f16 q3, q3, r2 +; CHECK-NEXT: vmul.f16 q7, q7, r2 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s6, s28 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s5, s9 +; CHECK-NEXT: vins.f16 s12, s28 +; CHECK-NEXT: vins.f16 s6, s5 +; CHECK-NEXT: vmovx.f16 s18, s13 +; CHECK-NEXT: vmovx.f16 s5, s29 +; CHECK-NEXT: vins.f16 s1, s9 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s23, s2 +; CHECK-NEXT: vmovx.f16 s5, s10 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vins.f16 s23, s5 +; CHECK-NEXT: vins.f16 s13, s29 +; CHECK-NEXT: vmovx.f16 s27, s3 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmovx.f16 s22, s14 +; CHECK-NEXT: vins.f16 s27, s8 +; CHECK-NEXT: vins.f16 s14, s30 +; CHECK-NEXT: vmovx.f16 s26, s15 +; CHECK-NEXT: vins.f16 s15, s31 +; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vins.f16 s3, s11 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmovx.f16 s5, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s29, s0 +; CHECK-NEXT: vins.f16 s22, s5 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s11, s31 +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s31, s6 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s17, s29 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmov.f32 s24, s15 +; CHECK-NEXT: vstrh.16 q5, [r1, #32] +; CHECK-NEXT: vstrh.16 q6, [r1, #48] +; CHECK-NEXT: vstrh.16 q2, [r1], #64 +; CHECK-NEXT: vmov.f32 s19, s31 ; CHECK-NEXT: vstrh.16 q4, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end Index: llvm/test/CodeGen/Thumb2/mve-vst2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -207,34 +207,34 @@ define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) 
{ ; CHECK-LABEL: vst2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vins.f16 s8, s2 -; CHECK-NEXT: vmov.f64 d6, d2 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vins.f16 s12, s0 -; CHECK-NEXT: vins.f16 s10, s3 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vstrb.8 q2, [r1, #16] -; CHECK-NEXT: vins.f16 s14, s1 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrb.8 q3, [r1] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s1, s10 +; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vins.f16 s11, s7 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmovx.f16 s14, s4 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vstrb.8 q0, [r1, #16] +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vins.f16 s9, s5 +; CHECK-NEXT: vmov.f32 s18, s9 +; CHECK-NEXT: vins.f16 s19, s4 +; CHECK-NEXT: vstrb.8 q4, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -520,10 +520,10 @@ ; CHECK-NEXT: vins.f16 s4, s0 ; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: str r0, [r1, #4] ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: str r0, [r1] -; CHECK-NEXT: str r2, [r1, #4] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -540,22 +540,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r12, [r0] ; CHECK-NEXT: ldrd r3, r0, [r0, #8] -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov.32 q1[1], r12 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vins.f16 s12, s8 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s11, s5 -; CHECK-NEXT: vins.f16 s5, s1 -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vins.f16 s11, s0 -; CHECK-NEXT: vstrh.16 q2, [r1] +; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s10, s1 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x 
half>* %src, i32 0 @@ -610,30 +611,31 @@ define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s9, s6 -; CHECK-NEXT: vins.f16 s6, s2 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vins.f16 s9, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: vmovx.f16 s11, s7 -; CHECK-NEXT: vins.f16 s7, s3 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vins.f16 s11, s12 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vmovx.f16 s3, s7 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s7, s11 +; CHECK-NEXT: vins.f16 s3, s12 ; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vstrb.8 q2, [r1, #16] -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s14, s8 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s5 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vmovx.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s14, s8 ; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmovx.f16 s11, s5 -; CHECK-NEXT: vins.f16 s5, s1 ; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vstrb.8 q0, [r1, #16] ; CHECK-NEXT: vstrb.8 q2, [r1] ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -347,77 +347,64 @@ define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { ; CHECK-LABEL: vst3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f64 d0, d6 ; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmovx.f16 s20, s12 ; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vins.f16 s8, s5 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vins.f16 s3, s5 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vmov.f32 s18, s8 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vins.f16 s2, s20 +; CHECK-NEXT: vmovx.f16 s20, s10 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s1, s17 +; CHECK-NEXT: 
vmov.f32 s2, s18 +; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vins.f16 s16, s20 +; CHECK-NEXT: vmovx.f16 s20, s11 ; CHECK-NEXT: vins.f16 s17, s7 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmovx.f16 s19, s7 ; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vmov.f32 s21, s13 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmov.f32 s22, s15 +; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmovx.f16 s24, s22 +; CHECK-NEXT: vins.f16 s18, s24 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s22, s18 +; CHECK-NEXT: vmov.f32 s17, s21 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmovx.f16 s20, s13 +; CHECK-NEXT: vins.f16 s8, s20 +; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vins.f16 s10, s20 ; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q5, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vins.f16 s10, s12 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -438,164 +425,132 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vmov.f64 d12, d2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.f32 s27, s5 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vins.f16 s27, s1 -; CHECK-NEXT: vmov.f32 s13, s4 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 
s25, s8 -; CHECK-NEXT: vmov.u16 r3, q3[2] -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.16 q3[2], r3 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vins.f16 s21, s3 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r3, q2[2] -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q5[4] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s16, s0 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vstrw.32 q5, [r1, #80] -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vins.f16 s19, s1 -; CHECK-NEXT: vmov.f32 s17, s8 -; CHECK-NEXT: vmov.f32 s9, s28 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s10, s28 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.f32 s17, s5 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vins.f16 s9, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f32 s10, s31 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vmov.u16 r0, q7[2] +; CHECK-NEXT: .pad #80 +; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vins.f16 s9, s7 ; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.f32 s9, s1 -; 
CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[4] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s14 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vmov.f32 s6, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #64] -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[3] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s30 -; CHECK-NEXT: vrev32.16 q3, q3 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vmov.u16 r2, q6[1] +; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s18, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s28, s4 +; CHECK-NEXT: vins.f16 s8, s24 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vins.f16 s11, s25 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s5, s20 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vins.f16 s5, s28 +; CHECK-NEXT: vmovx.f16 s28, s6 +; CHECK-NEXT: vins.f16 s10, s28 +; CHECK-NEXT: vmov.f64 d14, d8 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s0, s17 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vins.f16 s28, s4 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vins.f16 s0, s5 ; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.f32 s1, s29 +; CHECK-NEXT: vmov.f32 s31, s0 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmov.f32 s29, s16 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s30, s4 +; CHECK-NEXT: vmovx.f16 s4, s22 ; CHECK-NEXT: vmov.f32 s2, s30 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #112 +; 
CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.f32 s12, s13 +; CHECK-NEXT: vmov.f32 s30, s2 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s23 +; CHECK-NEXT: vins.f16 s1, s27 +; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vmovx.f16 s3, s27 +; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmov.f32 s5, s19 +; CHECK-NEXT: vmov.f32 s2, s23 +; CHECK-NEXT: vmovx.f16 s24, s1 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s5, s24 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vins.f16 s2, s24 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vins.f16 s12, s24 +; CHECK-NEXT: vmovx.f16 s24, s18 +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vins.f16 s14, s24 +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s15, s14 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmovx.f16 s16, s13 +; CHECK-NEXT: vrev32.16 q6, q6 +; CHECK-NEXT: vmov.f32 s20, s21 +; CHECK-NEXT: vins.f16 s25, s16 +; CHECK-NEXT: vmovx.f16 s16, s26 +; CHECK-NEXT: vins.f16 s14, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vmov.f32 s26, s14 +; CHECK-NEXT: vins.f16 s20, s4 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vins.f16 s22, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s23, s22 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vmov.f32 s22, s18 +; CHECK-NEXT: vmovx.f16 s16, s21 +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmov.f32 s13, s25 +; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vins.f16 s22, s16 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s22 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vstrw.32 q0, [r1, #80] +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vstrw.32 q3, [r1, #64] +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vstrw.32 q5, [r1, #16] +; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -717,68 +672,63 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-LABEL: vst3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u16 q1, [r0, #8] -; CHECK-NEXT: vldrb.u16 q2, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} +; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: vldrb.u16 q2, [r0, #8] +; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vins.f16 s3, s12 ; CHECK-NEXT: vldrb.u16 q3, [r0] -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmovx.f16 s20, s1 ; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 
q5[5], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vins.f16 s2, s20 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s1, s17 +; CHECK-NEXT: vmov.f32 s2, s18 ; CHECK-NEXT: vmov.8 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.8 q4[1], r0 ; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vstrb.16 q0, [r1, #16] +; CHECK-NEXT: vmov.8 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.8 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] ; CHECK-NEXT: vmov.8 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.8 q4[4], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.8 q4[8], r0 ; CHECK-NEXT: vmov.u16 r0, q3[3] ; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.8 q4[10], r0 ; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.8 q4[11], r0 ; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.8 q4[12], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.8 q4[13], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.8 q4[13], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.f32 s2, s22 ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vstrb.16 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 @@ -1355,19 +1305,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldmia r0, {s0, s1} ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmovx.f16 s14, s1 -; CHECK-NEXT: vins.f16 s14, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: stm r1!, {r0, r2, r3} +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: str r0, [r1, #8] +; CHECK-NEXT: strd r3, r2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1388,6 +1338,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: ldrd r2, r12, [r0] ; CHECK-NEXT: ldrd r3, lr, [r0, #8] ; CHECK-NEXT: vmov.32 q0[0], r2 @@ -1397,28 +1349,29 @@ ; CHECK-NEXT: vmov.32 q1[1], lr ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s10, s0 +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s8, s5 ; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vins.f16 s0, s2 +; 
CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmovx.f16 s1, s3 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s5, s12 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: strd r2, r0, [r1, #16] +; CHECK-NEXT: vins.f16 s4, s10 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s10, s1 +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vins.f16 s5, s10 +; CHECK-NEXT: vins.f16 s17, s12 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: strd r0, r2, [r1, #16] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -1439,60 +1392,61 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] ; CHECK-NEXT: vmov.f64 d0, d6 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vins.f16 s0, s20 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vins.f16 s4, s21 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vins.f16 s3, s9 -; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vmov.f32 s3, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vins.f16 s21, s11 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vins.f16 s17, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s11, s23 +; CHECK-NEXT: vins.f16 s11, s24 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s8, s22 ; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmovx.f16 s23, s11 -; CHECK-NEXT: vrev32.16 q2, q2 -; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vins.f16 s8, s24 ; CHECK-NEXT: vmov.f32 s25, s15 -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vmovx.f16 s28, s21 +; CHECK-NEXT: vins.f16 s9, s23 ; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmovx.f16 s28, s9 ; CHECK-NEXT: vins.f16 s25, s28 ; CHECK-NEXT: vmovx.f16 s28, s26 -; CHECK-NEXT: vins.f16 s22, s28 +; CHECK-NEXT: vins.f16 s10, s28 ; CHECK-NEXT: vmovx.f16 s28, s13 ; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vrev32.16 q5, q5 ; CHECK-NEXT: vins.f16 s4, s28 ; CHECK-NEXT: vmovx.f16 s28, s14 ; CHECK-NEXT: vins.f16 s6, s28 -; CHECK-NEXT: vmov.f32 s26, s22 +; CHECK-NEXT: vmov.f32 s26, s10 ; CHECK-NEXT: vmov.f32 s7, s6 ; CHECK-NEXT: vmov.f32 s6, s14 ; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vins.f16 s9, s12 -; CHECK-NEXT: 
vmovx.f16 s12, s10 +; CHECK-NEXT: vins.f16 s21, s12 +; CHECK-NEXT: vmovx.f16 s12, s22 ; CHECK-NEXT: vins.f16 s6, s12 ; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s21, s25 -; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s9, s25 +; CHECK-NEXT: vmov.f32 s5, s21 ; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s22, s26 +; CHECK-NEXT: vmov.f32 s10, s26 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vmov.f32 s6, s22 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: bx lr @@ -1515,147 +1469,147 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s5, s19 -; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vmovx.f16 s7, s19 -; CHECK-NEXT: vmov.f64 d14, d6 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: .pad #128 +; CHECK-NEXT: sub sp, #128 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s9, s7 +; CHECK-NEXT: vmov.f32 s10, s31 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.f32 s9, s15 -; CHECK-NEXT: vins.f16 s28, s16 -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vins.f16 s9, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s24 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vins.f16 s0, s9 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmovx.f16 s2, s16 +; CHECK-NEXT: vmov.f32 s15, s0 ; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.f64 d0, d10 -; CHECK-NEXT: vins.f16 s0, s4 -; 
CHECK-NEXT: vmov.f32 s10, s24 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vins.f16 s3, s5 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov.f32 s13, s4 ; CHECK-NEXT: vmov.f32 s5, s20 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s31, s13 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vins.f16 s31, s17 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vins.f16 s9, s0 -; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vmovx.f16 s12, s26 -; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s30 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vins.f16 s8, s12 -; CHECK-NEXT: vmovx.f16 s12, s27 -; CHECK-NEXT: vins.f16 s9, s3 -; CHECK-NEXT: vmovx.f16 s11, s3 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vins.f16 s11, s12 -; CHECK-NEXT: vmov.f32 s13, s23 -; CHECK-NEXT: vmov.f32 s10, s27 +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s24, s16 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vins.f16 s0, s17 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov.f32 s27, s0 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vmov.f32 s5, s28 +; CHECK-NEXT: vmov.f32 s6, s28 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vmov.f32 s28, s29 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vins.f16 s5, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s23 +; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vins.f16 s1, s16 +; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vins.f16 s6, s16 ; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vmov.f32 s14, s23 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vmovx.f16 s16, s14 -; CHECK-NEXT: vins.f16 s10, s16 ; CHECK-NEXT: vmov.f32 s20, s21 -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s24, s25 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vins.f16 s20, s16 -; CHECK-NEXT: vmovx.f16 s16, s14 +; CHECK-NEXT: vmovx.f16 s16, s10 ; CHECK-NEXT: vins.f16 s22, s16 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s23, s22 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s22, s18 ; CHECK-NEXT: vmovx.f16 s16, s21 -; 
CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vins.f16 s9, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s16, s10 ; CHECK-NEXT: vins.f16 s22, s16 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vins.f16 s24, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vins.f16 s28, s8 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vins.f16 s30, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s31, s30 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s18 +; CHECK-NEXT: vmovx.f16 s16, s29 +; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: vstrw.32 q1, [r1, #80] +; CHECK-NEXT: vins.f16 s1, s16 ; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s26, s16 -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vmov.f32 s27, s26 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s4, s25 -; CHECK-NEXT: vrev32.16 q4, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s17, s4 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s26, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s30, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s30 ; CHECK-NEXT: vmov.f32 s25, s17 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.f32 s22, s6 ; CHECK-NEXT: vmov.f32 s26, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vmov.f32 s9, s17 +; CHECK-NEXT: vmov.f32 s30, s2 +; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vmov.f32 s21, s17 +; CHECK-NEXT: vmov.f32 s22, s18 ; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: add sp, #128 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: Index: 
llvm/test/CodeGen/Thumb2/mve-vst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -422,65 +422,53 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d8 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vins.f16 s21, s21 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vins.f16 s12, s9 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s25, s4 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vins.f16 s25, s25 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmov.f32 s20, s19 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vins.f16 s20, s11 -; CHECK-NEXT: vmov.f32 s3, s27 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vins.f16 s25, s25 -; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vstrb.8 q0, [r1] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.f32 s21, s25 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vmovx.f16 s24, s11 +; CHECK-NEXT: vmovx.f16 s16, s13 +; CHECK-NEXT: vins.f16 s13, s13 +; CHECK-NEXT: vins.f16 s16, s16 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.f32 s23, s16 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.f32 s3, s23 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vins.f16 s20, s16 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmovx.f16 s22, s7 +; CHECK-NEXT: vins.f16 s7, s11 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vmovx.f16 s26, s6 +; CHECK-NEXT: vmovx.f16 s19, s12 +; CHECK-NEXT: vins.f16 s12, s12 +; CHECK-NEXT: vmov.f32 s20, s7 +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmovx.f16 s23, s15 +; CHECK-NEXT: vins.f16 s15, s15 +; CHECK-NEXT: vmov.f32 s24, s6 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmovx.f16 s27, s14 +; CHECK-NEXT: vins.f16 s14, s14 +; CHECK-NEXT: vins.f16 s19, s19 +; CHECK-NEXT: vmov.f32 s25, s14 +; CHECK-NEXT: vins.f16 s23, s23 +; CHECK-NEXT: vins.f16 s27, s27 ; CHECK-NEXT: vstrb.8 q5, [r1, #48] -; CHECK-NEXT: vmov.f32 s17, s6 -; CHECK-NEXT: vins.f16 s24, s10 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vins.f16 s17, s17 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: 
vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.f32 s25, s17 -; CHECK-NEXT: vmov.f32 s27, s19 ; CHECK-NEXT: vstrb.8 q6, [r1, #32] +; CHECK-NEXT: vstrb.8 q0, [r1, #16] +; CHECK-NEXT: vstrb.8 q4, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: @@ -1044,22 +1032,19 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s4, s5} -; CHECK-NEXT: vldr s0, [r0, #8] -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vmov.f32 s1, s0 -; CHECK-NEXT: vmovx.f16 s14, s0 -; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vldmia r0, {s0, s1} +; CHECK-NEXT: vldr s4, [r0, #8] +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vins.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vins.f16 s14, s0 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vstrh.16 q2, [r1] +; CHECK-NEXT: vmov.f32 s5, s4 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1082,40 +1067,40 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], lr +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, lr, [r0, #8] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] ; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 -; CHECK-NEXT: vmov.32 q1[1], r2 -; CHECK-NEXT: ldrd r2, r0, [r0, #16] +; CHECK-NEXT: vmov.32 q1[1], lr ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: vins.f16 s8, s5 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.f32 s6, s4 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmovx.f16 s11, s5 -; CHECK-NEXT: vins.f16 s11, s12 -; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s3, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmovx.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.f32 s10, s8 +; CHECK-NEXT: vmov.f32 s11, s9 +; CHECK-NEXT: vmovx.f16 s3, s8 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmovx.f16 s14, s10 +; CHECK-NEXT: vmovx.f16 s7, s9 +; CHECK-NEXT: vins.f16 s3, s14 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vmovx.f16 s14, s11 +; CHECK-NEXT: vins.f16 s7, s14 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vstrh.16 q1, [r1, #16] ; CHECK-NEXT: vstrh.16 q0, [r1] ; 
CHECK-NEXT: pop {r7, pc} entry: @@ -1204,56 +1189,63 @@ define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vmovx.f16 s2, s21 -; CHECK-NEXT: vins.f16 s21, s17 -; CHECK-NEXT: vmov.f32 s0, s21 -; CHECK-NEXT: vmovx.f16 s4, s17 -; CHECK-NEXT: vins.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s8, s20 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vins.f16 s20, s16 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmovx.f16 s10, s23 -; CHECK-NEXT: vins.f16 s23, s19 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmovx.f16 s30, s5 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vins.f16 s5, s17 +; CHECK-NEXT: vins.f16 s30, s8 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s8, s16 +; CHECK-NEXT: vins.f16 s4, s16 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s10, s7 ; CHECK-NEXT: vmovx.f16 s12, s19 -; CHECK-NEXT: vmov.f32 s8, s23 -; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vins.f16 s7, s19 ; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmovx.f16 s14, s22 -; CHECK-NEXT: vins.f16 s22, s18 -; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmovx.f16 s14, s6 +; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vins.f16 s6, s18 ; CHECK-NEXT: vins.f16 s14, s16 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s13, s18 +; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov.f32 s28, s5 ; CHECK-NEXT: vmovx.f16 s24, s17 ; CHECK-NEXT: vins.f16 s17, s17 -; CHECK-NEXT: vins.f16 s24, s24 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmovx.f16 s7, s16 -; CHECK-NEXT: vmov.f32 s23, s24 +; CHECK-NEXT: vmovx.f16 s23, s16 ; CHECK-NEXT: vins.f16 s16, s16 -; CHECK-NEXT: vmov.f32 s5, s16 ; CHECK-NEXT: vmovx.f16 s11, s19 ; CHECK-NEXT: vins.f16 s19, s19 -; CHECK-NEXT: vins.f16 s13, s13 -; CHECK-NEXT: vmov.f32 s9, s19 ; CHECK-NEXT: vmovx.f16 s15, s18 -; CHECK-NEXT: vmov.f32 s1, s21 -; CHECK-NEXT: vins.f16 s7, s7 +; CHECK-NEXT: vins.f16 s18, s18 +; CHECK-NEXT: vins.f16 s24, s24 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s3, s24 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov.f32 s29, s1 ; CHECK-NEXT: vins.f16 s11, s11 ; CHECK-NEXT: vins.f16 s15, s15 +; CHECK-NEXT: vins.f16 s23, s23 +; CHECK-NEXT: vmov.f32 s8, s7 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s9, s19 +; CHECK-NEXT: vmov.f32 s13, s18 ; CHECK-NEXT: vstrb.8 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s31, s3 +; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov.f32 s21, s16 ; CHECK-NEXT: vstrb.8 q3, [r1, #32] -; CHECK-NEXT: vmov.f32 s3, s23 -; CHECK-NEXT: vstrb.8 q1, [r1] -; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vmov.f32 s26, s0 +; CHECK-NEXT: vstrb.8 q7, [r1, #16] +; CHECK-NEXT: vmov.f32 s25, s16 +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vstrb.8 q6, [r1] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0