diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -46766,6 +46766,11 @@ }); }; + auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) { + unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits(); + return MaxActiveBits <= ScalarVT.getSizeInBits(); + }; + // Check if each element of the vector is right-shifted by one. SDValue LHS = In.getOperand(0); SDValue RHS = In.getOperand(1); @@ -46785,6 +46790,9 @@ }; auto AVGSplitter = [&](std::array Ops) { + for (SDValue &Op : Ops) + if (Op.getValueType() != VT) + Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); // Pad to a power-of-2 vector, split+apply and extract the original vector. unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); @@ -46855,12 +46863,9 @@ // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) - if (Operands[j].getValueType() != VT) { - if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || - Operands[j].getOperand(0).getValueType() != VT) + if (Operands[j].getValueType() != VT) + if (!IsZExtLike(Operands[j])) return SDValue(); - Operands[j] = Operands[j].getOperand(0); - } // The pattern is detected, emit X86ISD::AVG instruction(s). return AVGSplitter({Operands[0], Operands[1]}); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -2596,105 +2596,15 @@ define <8 x i16> @PR52131_pavg_chain(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; SSE2-LABEL: PR52131_pavg_chain: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm5 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: paddd %xmm5, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: pslld $15, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $15, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: pavgw %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: PR52131_pavg_chain: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR52131_pavg_chain: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: PR52131_pavg_chain: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: PR52131_pavg_chain: +; AVX: # %bb.0: +; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> %i2 = add nuw nsw <8 x i32> %i, @@ -2819,72 +2729,38 @@ define <8 x i16> @PR52131_pavg_with_mask(<8 x i32> %a, <8 x i16> %b) { ; SSE2-LABEL: PR52131_pavg_with_mask: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pslld $15, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $15, %xmm0 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pavgw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: PR52131_pavg_with_mask: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR52131_pavg_with_mask: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR52131_pavg_with_mask: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %i = and <8 x i32> %a,