Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -12125,18 +12125,86 @@
                        DAG.getNode(ISD::MUL, DL, VT, N01, N1));
 }
 
+static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  auto IsSignExt = [&](SDValue Op) {
+    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
+      return SDValue();
+    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
+    if (VT.getScalarSizeInBits() == 32)
+      return Op->getOperand(0);
+    return SDValue();
+  };
+  auto IsZeroExt = [&](SDValue Op) {
+    // Zero extends are a little more awkward. At the point we are matching
+    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
+    // That might be before or after a bitcast depending on how the and is
+    // placed. Because this has to look through bitcasts, it is currently only
+    // supported on LE.
+    if (!Subtarget->isLittle())
+      return SDValue();
+
+    SDValue And = Op;
+    if (And->getOpcode() == ISD::BITCAST)
+      And = And->getOperand(0);
+    if (And->getOpcode() != ISD::AND)
+      return SDValue();
+    SDValue Mask = And->getOperand(1);
+    if (Mask->getOpcode() == ISD::BITCAST)
+      Mask = Mask->getOperand(0);
+
+    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
+        Mask.getValueType() != MVT::v4i32)
+      return SDValue();
+    if (isAllOnesConstant(Mask->getOperand(0)) &&
+        isNullConstant(Mask->getOperand(1)) &&
+        isAllOnesConstant(Mask->getOperand(2)) &&
+        isNullConstant(Mask->getOperand(3)))
+      return And->getOperand(0);
+    return SDValue();
+  };
+
+  SDLoc dl(N);
+  if (SDValue Op0 = IsSignExt(N0)) {
+    if (SDValue Op1 = IsSignExt(N1)) {
+      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
+    }
+  }
+  if (SDValue Op0 = IsZeroExt(N0)) {
+    if (SDValue Op1 = IsZeroExt(N1)) {
+      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
+    }
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
 
+  EVT VT = N->getValueType(0);
+  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
+    return PerformMVEVMULLCombine(N, DAG, Subtarget);
+
   if (Subtarget->isThumb1Only())
     return SDValue();
 
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT.is64BitVector() || VT.is128BitVector())
     return PerformVMULCombine(N, DCI, Subtarget);
   if (VT != MVT::i32)
Index: llvm/lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrInfo.td
+++ llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -289,6 +289,11 @@
 def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
 def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
 
+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                       SDTCisSameAs<1, 2>]>;
+def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
 def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
                                       SDTCisInt<3>]>;
 def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2540,22 +2540,23 @@
   def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
             (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
 
-  def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
-                        v8i8), (MVE_VMOVLs8th MQPR:$src)>;
-  def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
-                        v4i16), (MVE_VMOVLs16th MQPR:$src)>;
-  def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
-                        (i32 0xAFF)), (MVE_VMOVLu8th MQPR:$src)>;
-  def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
-                 (v4i32 (ARMvmovImm (i32 0xCFF)))),
-            (MVE_VMOVLu16th MQPR:$src)>;
+  def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8),
+            (MVE_VMOVLs8th MQPR:$src)>;
+  def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16),
+            (MVE_VMOVLs16th MQPR:$src)>;
 
-  // zext_inreg 16 -> 32
-  def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
-            (MVE_VMOVLu16bh MQPR:$src)>;
   // zext_inreg 8 -> 16
   def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)),
             (MVE_VMOVLu8bh MQPR:$src)>;
+  // zext_inreg 16 -> 32
+  def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+            (MVE_VMOVLu16bh MQPR:$src)>;
+  // Same zext_inreg with vrevs, picking the top half
+  def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)),
+            (MVE_VMOVLu8th MQPR:$src)>;
+  def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
+                 (v4i32 (ARMvmovImm (i32 0xCFF)))),
+            (MVE_VMOVLu16th MQPR:$src)>;
 }
@@ -4418,6 +4419,50 @@
 defm MVE_VMULLTp16 : MVE_VMULL_m;
 
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+            (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+                              (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+            (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>;
+
+  def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16),
+                 (sext_inreg (v4i32 MQPR:$src2), v4i16)),
+            (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16),
+                 (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)),
+            (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>;
+
+  def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8),
+                 (sext_inreg (v8i16 MQPR:$src2), v8i8)),
+            (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8),
+                 (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)),
+            (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>;
+
+  def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+            (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+                              (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+            (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>;
+
+  def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+                 (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))),
+            (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))),
+                      (v4i32 (ARMvmovImm (i32 0xCFF)))),
+                 (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))),
+                      (v4i32 (ARMvmovImm (i32 0xCFF))))),
+            (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>;
+
+  def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)),
+                 (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))),
+            (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>;
+  def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)),
+                 (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))),
+            (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>;
+}
+
 class MVE_VxMULH size, bit round, list pattern=[]> : MVE_qDest_qSrc;
Index: llvm/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/lib/Target/ARM/ARMInstrNEON.td
 def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
 
-def SDTARMVMULL   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
-                                         SDTCisSameAs<1, 2>]>;
-def NEONvmulls    : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
-def NEONvmullu    : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
-
 def SDTARMVTBL1   : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
                                          SDTCisVT<2, v8i8>]>;
 def SDTARMVTBL2   : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
@@ -4428,17 +4423,17 @@
 let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
     DecoderNamespace = "NEONData" in {
   defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                           "vmull", "s", NEONvmulls, 1>;
+                           "vmull", "s", ARMvmulls, 1>;
   defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                           "vmull", "u", NEONvmullu, 1>;
+                           "vmull", "u", ARMvmullu, 1>;
   def  VMULLp8  : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D,
                           "vmull", "p8", v8i16, v8i8, int_arm_neon_vmullp, 1>;
   def  VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
                             "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
                             Requires<[HasV8, HasCrypto]>;
 }
-defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
-defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
 
 // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
 defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
@@ -4508,12 +4503,12 @@
 
 // VMLAL : Vector Multiply Accumulate Long (Q += D * D)
 defm VMLALs   : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
-                              "vmlal", "s", NEONvmulls, add>;
+                              "vmlal", "s", ARMvmulls, add>;
 defm VMLALu   : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
-                              "vmlal", "u", NEONvmullu, add>;
+                              "vmlal", "u", ARMvmullu, add>;
 
-defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
-defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>;
 
 let Predicates = [HasNEON, HasV8_1a] in {
   // v8.1a Neon Rounding Double Multiply-Op vector operations,
@@ -4741,12 +4736,12 @@
 
 // VMLSL : Vector Multiply Subtract Long (Q -= D * D)
 defm VMLSLs   : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
-                              "vmlsl", "s", NEONvmulls, sub>;
+                              "vmlsl", "s", ARMvmulls, sub>;
 defm VMLSLu   : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
-                              "vmlsl", "u", NEONvmullu, sub>;
+                              "vmlsl", "u", ARMvmullu, sub>;
 
-defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
-defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>;
 
 // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
 defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -41,12 +41,13 @@
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: umlal r0, r1, r3, r2
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
 ; CHECK-NEXT: bx lr
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -59,12 +60,13 @@
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: smlal r0, r1, r3, r2
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
 ; CHECK-NEXT: bx lr
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
@@ -314,10 +316,8 @@
 ; CHECK-NEXT: vmov.16 q3[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[15]
 ; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmovlb.u8 q3, q3
 ; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.u8 q2, q3, q2
 ; CHECK-NEXT: vmov.16 q3[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[1]
 ; CHECK-NEXT: vmov.16 q3[1], r0
@@ -334,24 +334,22 @@
 ; CHECK-NEXT: vmov.u8 r0, q1[7]
 ; CHECK-NEXT: vmov.16 q3[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r0
+; CHECK-NEXT: vmov.16 q1[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r0
+; CHECK-NEXT: vmov.16 q1[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vmov.16 q1[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vmov.16 q1[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
+; CHECK-NEXT: vmov.16 q1[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
+; CHECK-NEXT: vmov.16 q1[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
+; CHECK-NEXT: vmov.16 q1[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.u8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmullb.u8 q0, q1, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q2
 ; CHECK-NEXT: vaddv.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
@@ -399,10 +397,8 @@
 ; CHECK-NEXT: vmov.16 q3[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[15]
 ; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.s8 q2, q2
-; CHECK-NEXT: vmovlb.s8 q3, q3
 ; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.s8 q2, q3, q2
 ; CHECK-NEXT: vmov.16 q3[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[1]
 ; CHECK-NEXT: vmov.16 q3[1], r0
@@ -419,24 +415,22 @@
 ; CHECK-NEXT: vmov.u8 r0, q1[7]
 ; CHECK-NEXT: vmov.16 q3[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r0
+; CHECK-NEXT: vmov.16 q1[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r0
+; CHECK-NEXT: vmov.16 q1[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vmov.16 q1[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vmov.16 q1[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
+; CHECK-NEXT: vmov.16 q1[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
+; CHECK-NEXT: vmov.16 q1[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
+; CHECK-NEXT: vmov.16 q1[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.s8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmullb.s8 q0, q1, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q2
 ; CHECK-NEXT: vaddv.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
@@ -452,9 +446,7 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i16_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
 ; CHECK-NEXT: vaddv.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
 ; CHECK-NEXT: bx lr
@@ -469,9 +461,7 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
 ; CHECK-NEXT: vaddv.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
 ; CHECK-NEXT: bx lr
@@ -1014,14 +1004,15 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r7, lr}
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s6
-; CHECK-NEXT: umull r2, lr, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: umlal r2, lr, r3, r12
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r12, s11
+; CHECK-NEXT: vmov lr, s9
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, lr, r12
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, lr
+; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: pop {r7, pc}
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -1037,14 +1028,15 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r7, lr}
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s6
-; CHECK-NEXT: smull r2, lr, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: smlal r2, lr, r3, r12
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r12, s11
+; CHECK-NEXT: vmov lr, s9
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, lr, r12
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, lr
+; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: pop {r7, pc}
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
@@ -1316,10 +1308,8 @@
 ; CHECK-NEXT: vmov.16 q3[6], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[15]
 ; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmovlb.u8 q3, q3
 ; CHECK-NEXT: vmov.u8 r1, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.u8 q2, q3, q2
 ; CHECK-NEXT: vmov.16 q3[0], r1
 ; CHECK-NEXT: vmov.u8 r1, q1[1]
 ; CHECK-NEXT: vmov.16 q3[1], r1
@@ -1336,24 +1326,22 @@
 ; CHECK-NEXT: vmov.u8 r1, q1[7]
 ; CHECK-NEXT: vmov.16 q3[7], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r1
+; CHECK-NEXT: vmov.16 q1[0], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r1
+; CHECK-NEXT: vmov.16 q1[1], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r1
+; CHECK-NEXT: vmov.16 q1[2], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r1
+; CHECK-NEXT: vmov.16 q1[3], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r1
+; CHECK-NEXT: vmov.16 q1[4], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r1
+; CHECK-NEXT: vmov.16 q1[5], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r1
+; CHECK-NEXT: vmov.16 q1[6], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.u8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r1
+; CHECK-NEXT: vmullb.u8 q0, q1, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q2
 ; CHECK-NEXT: vaddva.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
@@ -1402,10 +1390,8 @@
 ; CHECK-NEXT: vmov.16 q3[6], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[15]
 ; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.s8 q2, q2
-; CHECK-NEXT: vmovlb.s8 q3, q3
 ; CHECK-NEXT: vmov.u8 r1, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.s8 q2, q3, q2
 ; CHECK-NEXT: vmov.16 q3[0], r1
 ; CHECK-NEXT: vmov.u8 r1, q1[1]
 ; CHECK-NEXT: vmov.16 q3[1], r1
@@ -1422,24 +1408,22 @@
 ; CHECK-NEXT: vmov.u8 r1, q1[7]
 ; CHECK-NEXT: vmov.16 q3[7], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r1
+; CHECK-NEXT: vmov.16 q1[0], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r1
+; CHECK-NEXT: vmov.16 q1[1], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r1
+; CHECK-NEXT: vmov.16 q1[2], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r1
+; CHECK-NEXT: vmov.16 q1[3], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r1
+; CHECK-NEXT: vmov.16 q1[4], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r1
+; CHECK-NEXT: vmov.16 q1[5], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r1
+; CHECK-NEXT: vmov.16 q1[6], r1
 ; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.s8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r1
+; CHECK-NEXT: vmullb.s8 q0, q1, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q2
 ; CHECK-NEXT: vaddva.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
@@ -1456,9 +1440,7 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
 ; CHECK-NEXT: vaddva.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
 ; CHECK-NEXT: bx lr
@@ -1474,9 +1456,7 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
 ; CHECK-NEXT: vaddva.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
 ; CHECK-NEXT: bx lr
Index: llvm/test/CodeGen/Thumb2/mve-vmull.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vmull.ll
+++ llvm/test/CodeGen/Thumb2/mve-vmull.ll
@@ -4,16 +4,7 @@
 define arm_aapcs_vfpcc <2 x i64> @sext_02(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: sext_02:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.32 q2[3], r1
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
 ; CHECK-NEXT: vmov q0, q2
 ; CHECK-NEXT: bx lr
 entry:
@@ -28,18 +19,8 @@
 define arm_aapcs_vfpcc <2 x i64> @sext_13(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: sext_13:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q1
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmov.32 q0[3], r1
+; CHECK-NEXT: vmullt.s32 q2, q0, q1
+; CHECK-NEXT: vmov q0, q2
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
@@ -53,16 +34,7 @@
 define arm_aapcs_vfpcc <2 x i64> @zext_02(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: zext_02:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.32 q2[3], r1
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
 ; CHECK-NEXT: vmov q0, q2
 ; CHECK-NEXT: bx lr
 entry:
@@ -77,18 +49,8 @@
 define arm_aapcs_vfpcc <2 x i64> @zext_13(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: zext_13:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q1
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmov.32 q0[3], r1
+; CHECK-NEXT: vmullt.u32 q2, q0, q1
+; CHECK-NEXT: vmov q0, q2
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
@@ -103,9 +65,7 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_0246(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: sext_0246:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullb.s16 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -119,9 +79,7 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_1357(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: sext_1357:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.s16 q1, q1
-; CHECK-NEXT: vmovlt.s16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullt.s16 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -135,9 +93,7 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_0246(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: zext_0246:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullb.u16 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -151,9 +107,7 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_1357(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: zext_1357:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.u16 q1, q1
-; CHECK-NEXT: vmovlt.u16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullt.u16 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -167,9 +121,7 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: sext_02468101214:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -183,9 +135,7 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: sext_13579111315:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.s8 q1, q1
-; CHECK-NEXT: vmovlt.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullt.s8 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -199,9 +149,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: zext_02468101214:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -215,9 +163,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: zext_13579111315:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.u8 q1, q1
-; CHECK-NEXT: vmovlt.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullt.u8 q0, q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>