Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -322,6 +322,8 @@
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SREM:
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -714,8 +714,8 @@
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
-    setOperationAction(ISD::MULHS, VT, Custom);
-    setOperationAction(ISD::MULHU, VT, Custom);
+    setOperationAction(ISD::MULHS, VT, Legal);
+    setOperationAction(ISD::MULHU, VT, Legal);
   } else {
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
@@ -2670,66 +2670,6 @@
                      DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
-// Lower vector multiply high (ISD::MULHS and ISD::MULHU).
-static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
-  // Multiplications are only custom-lowered for 128-bit vectors so that
-  // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
-  // legal.
-  EVT VT = Op.getValueType();
-  assert(VT.is128BitVector() && VT.isInteger() &&
-         "unexpected type for custom-lowering ISD::MULH{U,S}");
-
-  SDValue V0 = Op.getOperand(0);
-  SDValue V1 = Op.getOperand(1);
-
-  SDLoc DL(Op);
-
-  EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
-
-  // We turn (V0 mulhs/mulhu V1) to:
-  //
-  //  (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
-  //               (extract_subvector (ExtractVT V128:V1, (i64 0))))),
-  //        (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
-  //               (extract_subvector (ExtractVT V128:V2, (i64 VMull2Idx))))))
-  //
-  // Where ExtractVT is a subvector with half number of elements, and
-  // VMullIdx2 is the index of the middle element (the high part).
-  //
-  // The vector hight part extract and multiply will be matched against
-  // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64} which in turn will
-  // issue a {s}mull2 instruction.
-  //
-  // This basically multiply the lower subvector with '{s,u}mull', the high
-  // subvector with '{s,u}mull2', and shuffle both results high part in
-  // resulting vector.
-  unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2;
-  SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
-  SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
-
-  SDValue VMullV0 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
-  SDValue VMullV1 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
-
-  SDValue VMull2V0 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
-  SDValue VMull2V1 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
-
-  unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
-                                                  : AArch64ISD::UMULL;
-
-  EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
-  SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
-  SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
-
-  Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
-  Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
-
-  return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
-}
-
 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                        SelectionDAG &DAG) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2932,9 +2872,6 @@
     return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
-  case ISD::MULHS:
-  case ISD::MULHU:
-    return LowerMULH(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -4108,25 +4108,6 @@
 defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
                                UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
 
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
-                                  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
-  def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
-                           (extract_high_v16i8 V128:$Rm))),
-             (INST8B V128:$Rn, V128:$Rm)>;
-  def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
-                           (extract_high_v8i16 V128:$Rm))),
-             (INST4H V128:$Rn, V128:$Rm)>;
-  def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
-                           (extract_high_v4i32 V128:$Rm))),
-             (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
-                              SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
-                              UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
 // Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
 multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
                                       Instruction INST8B, Instruction INST4H, Instruction INST2S> {
@@ -5984,6 +5965,41 @@
 // __builtin_trap() uses the BRK instruction on AArch64.
 def : Pat<(trap), (BRK 1)>;
 
+// Multiply high patterns which multiply the lower subvector using smull/umull
+// and the upper subvector with smull2/umull2, then shuffle the high part of
+// both results together.
+def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v16i8
+            (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v8i16
+            (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+                              (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v4i32
+            (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+                              (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
+def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v16i8
+            (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v8i16
+            (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+                              (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v4i32
+            (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+                              (EXTRACT_SUBREG V128:$Rm, dsub)),
+            (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
 // Conversions within AdvSIMD types in the same register size are free.
 // But because we need a consistent lane ordering, in big endian many
 // conversions require one or more REV instructions.
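As a rough illustration of what the new mulhs patterns select (this note and the snippet are not part of the patch): the equivalent sequence for v8i16, written with ACLE NEON intrinsics, assuming little-endian lane ordering; the function name mulhs_v8i16 is made up for the example.

#include <arm_neon.h>

// Signed "multiply high" on 8 x i16: widen-multiply both halves, then keep
// the high 16 bits of every 32-bit product.
int16x8_t mulhs_v8i16(int16x8_t a, int16x8_t b) {
  // SMULL: widening multiply of the low 64-bit halves.
  int32x4_t lo = vmull_s16(vget_low_s16(a), vget_low_s16(b));
  // SMULL2: widening multiply of the high halves, no extra extract needed.
  int32x4_t hi = vmull_high_s16(a, b);
  // UZP2: pick the odd (high) 16-bit half of each 32-bit product.
  return vuzp2q_s16(vreinterpretq_s16_s32(lo), vreinterpretq_s16_s32(hi));
}

The unsigned variants map the same way onto umull/umull2, which is why the six new patterns differ only in the multiply instruction they feed into UZP2.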
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -11317,7 +11317,7 @@
   SDValue BC = peekThroughBitcasts(V);
 
   // Also check the simpler case, where we can directly reuse the scalar.
-  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+  if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -647,18 +647,17 @@
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movl $255, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movl $171, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pmullw %xmm2, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    movl $255, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_udiv_nonuniform4:
@@ -670,14 +669,12 @@
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $8, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    psllw $1, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT:    psllw $1, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -690,14 +687,12 @@
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -726,13 +721,9 @@
 ; XOP-NEXT:    movl $171, %eax
 ; XOP-NEXT:    vmovd %eax, %xmm1
 ; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOP-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
+; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
 ; XOP-NEXT:    movl $249, %eax
 ; XOP-NEXT:    vmovd %eax, %xmm2
 ; XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
===================================================================
--- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,12 +629,12 @@
 ;
 ; CHECK-AVX1-LABEL: test_urem_both:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
-; CHECK-AVX1-NEXT:    # xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -645,7 +645,7 @@
 ;
 ; CHECK-AVX2-LABEL: test_urem_both:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
 ; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
@@ -661,7 +661,7 @@
 ;
 ; CHECK-AVX512VL-LABEL: test_urem_both:
 ; CHECK-AVX512VL:       # %bb.0:
-; CHECK-AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
 ; CHECK-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; CHECK-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX512VL-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2