Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -249,9 +249,6 @@ SCALEFS, SCALEFS_RND, - // Unsigned Integer average. - AVG, - /// Integer horizontal add/sub. HADD, HSUB, Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -949,6 +949,8 @@ setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); @@ -1353,6 +1355,10 @@ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + if (HasInt256) { + setOperationAction(ISD::AVGCEILU, MVT::v16i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); @@ -1652,6 +1658,10 @@ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + if (HasBWI) { + setOperationAction(ISD::AVGCEILU, MVT::v32i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); @@ -31807,9 +31817,8 @@ Results.push_back(Res); return; } - case X86ISD::VPMADDWD: - case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/VPMADDWD by widening. + case X86ISD::VPMADDWD: { + // Legalize types for X86ISD::VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -33041,7 +33050,6 @@ NODE_NAME_CASE(SCALEF_RND) NODE_NAME_CASE(SCALEFS) NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(AVG) NODE_NAME_CASE(MULHRS) NODE_NAME_CASE(SINT_TO_FP_RND) NODE_NAME_CASE(UINT_TO_FP_RND) @@ -33222,7 +33230,6 @@ bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. - case X86ISD::AVG: case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: @@ -40618,7 +40625,6 @@ case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. - case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -47666,7 +47672,7 @@ /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// X86ISD::AVG instruction. +/// ISD::AVGCEILU (AVG) instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -47729,7 +47735,7 @@ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); + return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); }; auto AVGSplitter = [&](std::array Ops) { Index: llvm/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/lib/Target/X86/X86InstrAVX512.td +++ llvm/lib/Target/X86/X86InstrAVX512.td @@ -5039,7 +5039,7 @@ HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; -defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; Index: llvm/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -287,7 +287,6 @@ SDTCisSameAs<2, 1>]>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; Index: llvm/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/lib/Target/X86/X86InstrSSE.td +++ llvm/lib/Target/X86/X86InstrSSE.td @@ -3471,9 +3471,9 @@ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, +defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, +defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, SchedWriteVecIMul, 1, NoVLX>; Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -371,8 +371,8 @@ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -818,8 +818,8 @@ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1281,8 +1281,8 @@ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), Index: llvm/test/CodeGen/X86/avg.ll =================================================================== --- llvm/test/CodeGen/X86/avg.ll +++ llvm/test/CodeGen/X86/avg.ll @@ -64,15 +64,15 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX-LABEL: avg_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a @@ -162,16 +162,16 @@ ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -313,8 +313,8 @@ ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -361,15 +361,15 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX-LABEL: avg_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a @@ -407,16 +407,16 @@ ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -489,8 +489,8 @@ ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -561,15 +561,15 @@ ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqu %ymm2, (%rax) +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2645,8 +2645,8 @@ ; ; AVX-LABEL: PR52131_pavg_chain: ; AVX: # %bb.0: -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> @@ -2665,95 +2665,15 @@ define <8 x i16> @PR52131_pavg_chainlike_but_not_zext(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; SSE2-LABEL: PR52131_pavg_chainlike_but_not_zext: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm5 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pslld $15, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $15, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: pavgw %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: PR52131_pavg_chainlike_but_not_zext: +; AVX: # %bb.0: +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> %i2 = add nuw nsw <8 x i32> %i, @@ -2784,7 +2704,7 @@ ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2792,7 +2712,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2800,7 +2720,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %i = and <8 x i32> %a, Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -72,8 +72,8 @@ define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { ; CHECK-LABEL: avg_v64i8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 -; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq