Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1775,6 +1775,7 @@
   setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::MSTORE);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -19729,6 +19730,36 @@
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::AVG: {
+    // Legalize types for X86ISD::AVG by expanding vectors.
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+    auto InVT = N->getValueType(0);
+    auto InVTSize = InVT.getSizeInBits();
+    const unsigned RegSize =
+        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((!Subtarget->hasAVX512() || RegSize < 512) &&
+           "512-bit vector requires AVX512");
+    assert((!Subtarget->hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+    auto ElemVT = InVT.getVectorElementType();
+    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                  RegSize / ElemVT.getSizeInBits());
+    assert(RegSize % InVT.getSizeInBits() == 0);
+    unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+    Ops[0] = N->getOperand(0);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    Ops[0] = N->getOperand(1);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+                                  DAG.getIntPtrConstant(0, dl)));
+    return;
+  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -25222,6 +25253,105 @@
   return SDValue();
 }
 
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (!((VT.getVectorElementType() == MVT::i8 ||
+         VT.getVectorElementType() == MVT::i16) &&
+        InVT.getVectorElementType() == MVT::i32 && isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %N, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda checking the given SDValue is a constant vector with all ones.
+  auto IsConstVectorOfOnes = [](SDValue V) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    auto NumOperands = V.getNumOperands();
+    for (unsigned i = 0; i < NumOperands; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C || !C->isOne())
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is left-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorOfOnes(RHS))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorOfOnes(Operands[i]))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected, emit X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
+                          SDLoc(N));
+}
+
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -25486,6 +25616,16 @@
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
   if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+ SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -26748,6 +26888,7 @@ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4062,6 +4062,10 @@ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4078,10 +4082,6 @@ int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, int_x86_avx2_psad_bw, SSE_PMADD, 1>; Index: 
lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -250,6 +250,8 @@ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -1687,6 +1689,8 @@ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), Index: test/CodeGen/X86/avg.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avg.ll @@ -0,0 +1,530 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW + +define void @avg_v4i8(<4 x i8> %a, <4 x i8> %b) { +; SSE2-LABEL: avg_v4i8 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: 
packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v4i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <4 x i8> %a to <4 x i32> + %2 = zext <4 x i8> %b to <4 x i32> + %3 = add nuw nsw <4 x i32> %1, + %4 = add nuw nsw <4 x i32> %3, %2 + %5 = lshr <4 x i32> %4, + %6 = trunc <4 x i32> %5 to <4 x i8> + store <4 x i8> %6, <4 x i8>* undef, align 4 + ret void +} + +define void @avg_v8i8(<8 x i8> %a, <8 x i8> %b) { +; SSE2-LABEL: avg_v8i8 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v8i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <8 x i8> %a to <8 x i32> + %2 = zext <8 x i8> %b to <8 x i32> + %3 = add nuw nsw <8 x i32> %1, + %4 = add nuw nsw <8 x i32> %3, %2 + %5 = lshr <8 x i32> %4, + %6 = trunc <8 x i32> %5 to <8 x i8> + store <8 x i8> %6, <8 x i8>* undef, align 4 + ret void +} + +define void @avg_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: avg_v16i8 +; SSE2: # BB#0: +; SSE2-NEXT: pavgb %xmm1, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v16i8 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v16i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <16 x i8> %a to <16 x i32> + %2 = zext <16 x i8> %b to <16 x i32> + %3 = add nuw nsw <16 x i32> %1, + %4 = add nuw nsw <16 x i32> %3, %2 + %5 = lshr <16 x i32> %4, + %6 = trunc <16 x i32> %5 to <16 x i8> + store <16 x i8> %6, <16 x i8>* undef, align 4 + ret void +} + +define void @avg_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX2-LABEL: avg_v32i8 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v32i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <32 x i8> %a to <32 x i32> + %2 = zext <32 x i8> %b to <32 x i32> + %3 = add nuw nsw <32 x i32> %1, + %4 = add nuw nsw <32 x i32> %3, %2 + %5 = lshr <32 x i32> %4, + %6 = trunc <32 x i32> %5 to <32 x i8> + store <32 x i8> %6, <32 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8(<64 x i8> %a, <64 x i8> %b) { +; AVX512BW-LABEL: avg_v64i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %zmm0, %zmm1, 
%zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <64 x i8> %a to <64 x i32> + %2 = zext <64 x i8> %b to <64 x i32> + %3 = add nuw nsw <64 x i32> %1, + %4 = add nuw nsw <64 x i32> %3, %2 + %5 = lshr <64 x i32> %4, + %6 = trunc <64 x i32> %5 to <64 x i8> + store <64 x i8> %6, <64 x i8>* undef, align 4 + ret void +} + +define void @avg_v4i16(<4 x i16> %a, <4 x i16> %b) { +; SSE2-LABEL: avg_v4i16 +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i16 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v4i16 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <4 x i16> %a to <4 x i32> + %2 = zext <4 x i16> %b to <4 x i32> + %3 = add nuw nsw <4 x i32> %1, + %4 = add nuw nsw <4 x i32> %3, %2 + %5 = lshr <4 x i32> %4, + %6 = trunc <4 x i32> %5 to <4 x i16> + store <4 x i16> %6, <4 x i16>* undef, align 4 + ret void +} + +define void @avg_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: avg_v8i16 +; SSE2: # BB#0: +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; 
SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i16 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v8i16 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nuw nsw <8 x i32> %1, + %4 = add nuw nsw <8 x i32> %3, %2 + %5 = lshr <8 x i32> %4, + %6 = trunc <8 x i32> %5 to <8 x i16> + store <8 x i16> %6, <8 x i16>* undef, align 4 + ret void +} + +define void @avg_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX2-LABEL: avg_v16i16 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v16i16 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nuw nsw <16 x i32> %1, + %4 = add nuw nsw <16 x i32> %3, %2 + %5 = lshr <16 x i32> %4, + %6 = trunc <16 x i32> %5 to <16 x i16> + store <16 x i16> %6, <16 x i16>* undef, align 4 + ret void +} + +define void @avg_v32i16(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: avg_v32i16 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nuw nsw <32 x i32> %1, + %4 = add nuw nsw <32 x i32> %3, %2 + %5 = lshr <32 x i32> %4, + %6 = trunc <32 x i32> %5 to <32 x i16> + store <32 x i16> %6, <32 x i16>* undef, align 4 + ret void +} + +define void @avg_v4i8_2(<4 x i8> %a, <4 x i8> %b) { +; SSE2-LABEL: avg_v4i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm1 +; 
SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pavgb %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v4i8_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <4 x i8> %a to <4 x i32> + %2 = zext <4 x i8> %b to <4 x i32> + %3 = add nuw nsw <4 x i32> %1, %2 + %4 = add nuw nsw <4 x i32> %3, + %5 = lshr <4 x i32> %4, + %6 = trunc <4 x i32> %5 to <4 x i8> + store <4 x i8> %6, <4 x i8>* undef, align 4 + ret void +} + +define void @avg_v8i8_2(<8 x i8> %a, <8 x i8> %b) { +; SSE2-LABEL: avg_v8i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pavgb %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v8i8_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: 
vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <8 x i8> %a to <8 x i32> + %2 = zext <8 x i8> %b to <8 x i32> + %3 = add nuw nsw <8 x i32> %1, %2 + %4 = add nuw nsw <8 x i32> %3, + %5 = lshr <8 x i32> %4, + %6 = trunc <8 x i32> %5 to <8 x i8> + store <8 x i8> %6, <8 x i8>* undef, align 4 + ret void +} + +define void @avg_v16i8_2(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: avg_v16i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: pavgb %xmm1, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v16i8_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <16 x i8> %a to <16 x i32> + %2 = zext <16 x i8> %b to <16 x i32> + %3 = add nuw nsw <16 x i32> %1, %2 + %4 = add nuw nsw <16 x i32> %3, + %5 = lshr <16 x i32> %4, + %6 = trunc <16 x i32> %5 to <16 x i8> + store <16 x i8> %6, <16 x i8>* undef, align 4 + ret void +} + +define void @avg_v32i8_2(<32 x i8> %a, <32 x i8> %b) { +; AVX2-LABEL: avg_v32i8 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v32i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <32 x i8> %a to <32 x i32> + %2 = zext <32 x i8> %b to <32 x i32> + %3 = add nuw nsw <32 x i32> %1, %2 + %4 = add nuw nsw <32 x i32> %3, + %5 = lshr <32 x i32> %4, + %6 = trunc <32 x i32> %5 to <32 x i8> + store <32 x i8> %6, <32 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8_2(<64 x i8> %a, <64 x i8> %b) { +; AVX512BW-LABEL: avg_v64i8_2 +; AVX512BW: # BB#0: 
+; AVX512BW-NEXT: vpavgb %zmm1, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <64 x i8> %a to <64 x i32> + %2 = zext <64 x i8> %b to <64 x i32> + %3 = add nuw nsw <64 x i32> %2, %2 + %4 = add nuw nsw <64 x i32> %3, + %5 = lshr <64 x i32> %4, + %6 = trunc <64 x i32> %5 to <64 x i8> + store <64 x i8> %6, <64 x i8>* undef, align 4 + ret void +} + + +define void @avg_v4i16_2(<4 x i16> %a, <4 x i16> %b) { +; SSE2-LABEL: avg_v4i16_2 +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v4i16_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <4 x i16> %a to <4 x i32> + %2 = zext <4 x i16> %b to <4 x i32> + %3 = add nuw nsw <4 x i32> %1, %2 + %4 = add nuw nsw <4 x i32> %3, + %5 = lshr <4 x i32> %4, + %6 = trunc <4 x i32> %5 to <4 x i16> + store <4 x i16> %6, <4 x i16>* undef, align 4 + ret void +} + +define void @avg_v8i16_2(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: avg_v8i16_2 +; 
SSE2: # BB#0: +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v8i16_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nuw nsw <8 x i32> %1, %2 + %4 = add nuw nsw <8 x i32> %3, + %5 = lshr <8 x i32> %4, + %6 = trunc <8 x i32> %5 to <8 x i16> + store <8 x i16> %6, <8 x i16>* undef, align 4 + ret void +} + +define void @avg_v16i16_2(<16 x i16> %a, <16 x i16> %b) { +; AVX2-LABEL: avg_v16i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: avg_v16i16_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nuw nsw <16 x i32> %1, %2 + %4 = add nuw nsw <16 x i32> %3, + %5 = lshr <16 x i32> %4, + %6 = trunc <16 x i32> %5 to <16 x i16> + store <16 x i16> %6, <16 x i16>* undef, align 4 + ret void +} + +define void @avg_v32i16_2(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: avg_v32i16_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nuw nsw <32 x i32> %1, %2 + %4 = add nuw nsw <32 x i32> %3, + %5 = lshr <32 x i32> %4, + %6 = trunc <32 x i32> %5 to <32 x i16> + store <32 x i16> %6, <32 x i16>* undef, align 4 + ret void +}