diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -249,9 +249,6 @@ SCALEFS, SCALEFS_RND, - // Unsigned Integer average. - AVG, - /// Integer horizontal add/sub. HADD, HSUB, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -949,6 +949,8 @@ setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); @@ -1353,6 +1355,10 @@ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + if (HasInt256) { + setOperationAction(ISD::AVGCEILU, MVT::v16i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); @@ -1652,6 +1658,10 @@ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + if (HasBWI) { + setOperationAction(ISD::AVGCEILU, MVT::v32i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); @@ -31807,9 +31817,8 @@ Results.push_back(Res); return; } - case X86ISD::VPMADDWD: - case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/VPMADDWD by widening. + case X86ISD::VPMADDWD: { + // Legalize types for X86ISD::VPMADDWD by widening. 
assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -33041,7 +33050,6 @@ NODE_NAME_CASE(SCALEF_RND) NODE_NAME_CASE(SCALEFS) NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(AVG) NODE_NAME_CASE(MULHRS) NODE_NAME_CASE(SINT_TO_FP_RND) NODE_NAME_CASE(UINT_TO_FP_RND) @@ -33222,7 +33230,6 @@ bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. - case X86ISD::AVG: case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: @@ -40632,7 +40639,6 @@ case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. - case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -47789,7 +47795,7 @@ /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// X86ISD::AVG instruction. +/// ISD::AVGCEILU (AVG) instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -47852,7 +47858,7 @@ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); + return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); }; auto AVGSplitter = [&](std::array<SDValue, 2> Ops) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5039,7 +5039,7 @@ HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; -defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; diff --git 
a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -287,7 +287,6 @@ SDTCisSameAs<2, 1>]>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3471,9 +3471,9 @@ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, +defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, +defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, SchedWriteVecIMul, 1, NoVLX>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -371,8 +371,8 @@ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, 
ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -818,8 +818,8 @@ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1281,8 +1281,8 @@ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -64,17 +64,31 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: 
pavgb (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -162,16 +176,16 @@ ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -313,8 +327,8 @@ ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -361,15 +375,15 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { ; 
SSE2-LABEL: avg_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX-LABEL: avg_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a @@ -407,8 +421,8 @@ ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -489,8 +503,8 @@ ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -561,15 +575,15 @@ ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqu %ymm2, (%rax) +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2645,7 
+2659,7 @@ ; ; AVX-LABEL: PR52131_pavg_chain: ; AVX: # %bb.0: -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -72,8 +72,8 @@ define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { ; CHECK-LABEL: avg_v64i8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 -; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq