Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -208,6 +208,9 @@
     VQMOVNs,   // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
     VQMOVNu,   // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
 
+    // MVE float <> half converts
+    VCVTN,     // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
+
     // Vector multiply long:
     VMULLs,    // ...signed
     VMULLu,    // ...unsigned
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1676,6 +1676,7 @@
   case ARMISD::VMOVN:    return "ARMISD::VMOVN";
   case ARMISD::VQMOVNs:  return "ARMISD::VQMOVNs";
   case ARMISD::VQMOVNu:  return "ARMISD::VQMOVNu";
+  case ARMISD::VCVTN:    return "ARMISD::VCVTN";
   case ARMISD::VMULLs:   return "ARMISD::VMULLs";
   case ARMISD::VMULLu:   return "ARMISD::VMULLu";
   case ARMISD::VADDVs:   return "ARMISD::VADDVs";
@@ -7122,6 +7123,60 @@
   return true;
 }
 
+// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
+// from a pair of inputs. For example:
+//   BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
+//               FP_ROUND(EXTRACT_ELT(Y, 0),
+//               FP_ROUND(EXTRACT_ELT(X, 1),
+//               FP_ROUND(EXTRACT_ELT(Y, 1), ...)
+static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
+                                         const ARMSubtarget *ST) {
+  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  if (!ST->hasMVEFloatOps())
+    return SDValue();
+
+  SDLoc dl(BV);
+  EVT VT = BV.getValueType();
+  if (VT != MVT::v8f16)
+    return SDValue();
+
+  // We are looking for a buildvector of fptrunc elements, where all the
+  // elements are extracted alternately from two sources. Check that the
+  // first two items are valid enough and extract some info from them (they
+  // are checked properly in the loop below).
+  if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
+      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
+    return SDValue();
+  if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
+      BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
+    return SDValue();
+  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+  SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
+  if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
+    return SDValue();
+
+  // Check all the values in the BuildVector line up with our expectations.
+  for (int i = 1; i < 4; i++) {
+    auto Check = [](SDValue Trunc, SDValue Op, int Idx) {
+      return Trunc.getOpcode() == ISD::FP_ROUND &&
+             Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+             Trunc.getOperand(0).getOperand(0) == Op &&
+             Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
+    };
+    if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
+      return SDValue();
+    if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
+      return SDValue();
+  }
+
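+  // Emit a pair of VCVTN nodes: the first truncates Op0 into the bottom
+  // (even) f16 lanes of the result, the second truncates Op1 into the top
+  // (odd) lanes while retaining the lanes already written.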
+  SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
+                           DAG.getConstant(0, dl, MVT::i32));
+  return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
+                     DAG.getConstant(1, dl, MVT::i32));
+}
+
 // If N is an integer constant that can be moved into a register in one
 // instruction, return an SDValue of such a constant (will become a MOV
 // instruction). Otherwise return null.
@@ -7377,12 +7432,16 @@
   if (isConstant)
     return SDValue();
 
-  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
-  if (NumElts >= 4) {
-    SDValue shuffle = ReconstructShuffle(Op, DAG);
-    if (shuffle != SDValue())
+  // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
+  // vmovn). Empirical tests suggest this is rarely worth it for vectors of
+  // length <= 2.
+  if (NumElts >= 4)
+    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
       return shuffle;
-  }
+
+  // Attempt to turn a buildvector of scalar fptruncs back into VCVTs.
+  if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
+    return VCVT;
 
   if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
     // If we haven't found an efficient lowering, try splitting a 128-bit vector
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4772,6 +4772,8 @@
   let retainsPreviousHalfElement = 1;
 }
 
+def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>;
+
 multiclass MVE_VCVT_f2h_m<string iname, int half> {
   def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
@@ -4786,6 +4788,9 @@
                              (v4i1 VCCR:$mask))),
               (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
                            ARMVCCThen, (v4i1 VCCR:$mask)))>;
+
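+    // Select the unpredicated VCVTB/VCVTT f16.f32 instruction (picked by the
+    // multiclass's half parameter: 0 for the bottom lanes, 1 for the top
+    // lanes) for the VCVTN node created in LowerBuildVectorOfFPTrunc.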
+    def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+              (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
   }
 }
Index: llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@@ -726,46 +726,24 @@
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0], #16
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s11
-; CHECK-NEXT:    vmovx.f16 s13, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s10
-; CHECK-NEXT:    vmovx.f16 s14, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s10
-; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s13
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s14
-; CHECK-NEXT:    vmul.f32 q2, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q3[1], r3
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s9
-; CHECK-NEXT:    vmov.16 q3[2], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s10
-; CHECK-NEXT:    vmov.16 q3[4], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q3[5], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s11
-; CHECK-NEXT:    vmov.16 q3[6], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q3[7], r2
-; CHECK-NEXT:    vstrb.8 q3, [r1], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmovx.f16 s14, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s11, s8
+; CHECK-NEXT:    vmovx.f16 s13, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s10, s14
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s9, s13
+; CHECK-NEXT:    vcvtb.f32.f16 s19, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s17, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s8, s12
+; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q2
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9}
@@ -817,85 +795,41 @@
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0]
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s11
-; CHECK-NEXT:    vmovx.f16 s13, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s10
-; CHECK-NEXT:    vmovx.f16 s14, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s10
-; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s13
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s14
-; CHECK-NEXT:    vmul.f32 q2, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q3[1], r3
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s9
-; CHECK-NEXT:    vmov.16 q3[2], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s10
-; CHECK-NEXT:    vmov.16 q3[4], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q3[5], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s11
-; CHECK-NEXT:    vmov.16 q3[6], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #16]!
-; CHECK-NEXT:    vmov.16 q3[7], r2
-; CHECK-NEXT:    vstrh.16 q3, [r1]
-; CHECK-NEXT:    vmovx.f16 s12, s11
-; CHECK-NEXT:    vmovx.f16 s14, s10
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmovx.f16 s14, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s11, s8
+; CHECK-NEXT:    vmovx.f16 s13, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s10, s14
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s9, s13
+; CHECK-NEXT:    vcvtb.f32.f16 s19, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s17, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s8, s12
+; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q2
+; CHECK-NEXT:    vstrh.16 q1, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]!
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s14, s6
 ; CHECK-NEXT:    vcvtb.f32.f16 s19, s12
-; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vmovx.f16 s8, s5
 ; CHECK-NEXT:    vcvtb.f32.f16 s18, s14
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s6
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s17, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s10
+; CHECK-NEXT:    vcvtb.f32.f16 s11, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s10, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s9, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s8, s4
 ; CHECK-NEXT:    vmul.f32 q1, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s4
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s8
-; CHECK-NEXT:    vmul.f32 q3, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s13
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmov.16 q2[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[6], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #16]!
 ; CHECK-NEXT:    le lr, .LBB10_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
Index: llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -100,31 +100,8 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s5
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -135,31 +112,9 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s5
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s1
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
@@ -170,59 +125,10 @@
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s16
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s17
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s9
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s18
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s10
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s19
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s13
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s15
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -233,59 +139,12 @@
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s16
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s9
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s17
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s10
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s18
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s19
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s13
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov q1, q3
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
@@ -296,31 +155,8 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc5:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s5
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out1 = fptrunc <4 x float> %src1 to <4 x half>
@@ -332,31 +168,9 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc6:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s5
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s1
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out1 = fptrunc <4 x float> %src1 to <4 x half>
@@ -368,59 +182,10 @@
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s16
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s17
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s9
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s18
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s10
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s19
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s13
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s15
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
 ; CHECK-NEXT:    bx lr
 entry:
   %out1 = fptrunc <8 x float> %src1 to <8 x half>
@@ -432,59 +197,12 @@
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
 ; CHECK-LABEL: shuffle_trunc8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s16
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s9
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s17
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s10
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s18
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s19
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s13
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s5
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s6
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov q1, q3
 ; CHECK-NEXT:    bx lr
 entry:
   %out1 = fptrunc <8 x float> %src1 to <8 x half>
@@ -750,31 +468,9 @@
 define arm_aapcs_vfpcc void @store_shuffletrunc_8(<8 x half>* %src, <4 x float> %val1, <4 x float> %val2) {
 ; CHECK-LABEL: store_shuffletrunc_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s1
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s5
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s7
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -786,59 +482,12 @@
 define arm_aapcs_vfpcc void @store_shuffletrunc_16(<16 x half>* %src, <8 x float> %val1, <8 x float> %val2) {
 ; CHECK-LABEL: store_shuffletrunc_16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s4
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q4[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s5
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s13
-; CHECK-NEXT:    vmov.16 q4[2], r1
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s6
-; CHECK-NEXT:    vmov.16 q4[3], r1
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s14
-; CHECK-NEXT:    vmov.16 q4[4], r1
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q4[5], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s15
-; CHECK-NEXT:    vmov.16 q4[6], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q4[7], r1
-; CHECK-NEXT:    vstrw.32 q4, [r0, #16]
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s8
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s1
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s11
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
+; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32>