Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -759,9 +759,7 @@ addAllExtLoads(VT, InnerVT, Expand); } - setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); @@ -940,6 +938,11 @@ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + } + // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4707,6 +4707,15 @@ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))>; + + // ISD::MULHU/MULHS + if !eq(round, 0b0) then { + defvar mulh = !if(VTI.Unsigned, mulhu, mulhs); + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (mulh (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b))), + (VTI.Vec (Inst MQPR:$a, MQPR:$b))>; + } + } } } Index: llvm/test/CodeGen/Thumb2/mve-vmulh.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -48,28 +48,7 @@ define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vmulhs_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmullb.s32 q0, q1, q3 -; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: smmul r1, r2, r1 -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmulh.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -83,21 +62,7 @@ define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vmulhu_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmullb.u32 q2, q4, q3 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmullb.u32 q3, q0, q1 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmulh.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i32> %s0 to <4 x i64> @@ -141,59 +106,7 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vmulhs_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmullb.s16 q2, q3, q2 -; CHECK-NEXT: vshr.s32 q3, q2, #16 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmullb.s16 q0, q1, q3 -; CHECK-NEXT: vshr.s32 q0, q0, #16 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmulh.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <8 x i16> %s0 to <8 x i32> @@ -207,59 +120,7 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vmulhu_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmullb.u16 q2, q3, q2 -; CHECK-NEXT: vshr.u32 q3, q2, #16 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmullb.u16 q0, q1, q3 -; CHECK-NEXT: vshr.u32 q0, q0, #16 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmulh.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <8 x i16> %s0 to <8 x i32> @@ -303,107 +164,7 @@ define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vmulhs_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmullb.s8 q2, q3, q2 -; CHECK-NEXT: vshr.s16 q3, q2, #8 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.8 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.8 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmullb.s8 q0, q1, q3 -; CHECK-NEXT: vshr.s16 q0, q0, #8 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmulh.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <16 x i8> %s0 to <16 x i16> @@ -417,107 +178,7 @@ define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vmulhu_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmullb.u8 q2, q3, q2 -; CHECK-NEXT: vshr.u16 q3, q2, #8 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.8 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.8 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmullb.u8 q0, q1, q3 -; CHECK-NEXT: vshr.u16 q0, q0, #8 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmulh.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <16 x i8> %s0 to <16 x i16>