Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1779,6 +1779,7 @@
     setTargetDAGCombine(ISD::MLOAD);
     setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::MSTORE);
+    setTargetDAGCombine(ISD::TRUNCATE);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -19853,6 +19854,36 @@
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::AVG: {
+    // Legalize types for X86ISD::AVG by expanding vectors.
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+    auto InVT = N->getValueType(0);
+    auto InVTSize = InVT.getSizeInBits();
+    const unsigned RegSize =
+        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((Subtarget->hasAVX512() || RegSize < 512) &&
+           "512-bit vector requires AVX512");
+    assert((Subtarget->hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+    auto ElemVT = InVT.getVectorElementType();
+    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                  RegSize / ElemVT.getSizeInBits());
+    assert(RegSize % InVT.getSizeInBits() == 0);
+    unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+    Ops[0] = N->getOperand(0);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    Ops[0] = N->getOperand(1);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+                                  DAG.getIntPtrConstant(0, dl)));
+    return;
+  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -25347,6 +25378,132 @@
   return SDValue();
 }
 
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces this operation with the
+/// efficient X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern and it should be
+  // greater than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda checking the given SDValue is a constant vector and each element
+  // is in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is right-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // element is in the range [1, 256].
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorInRange(Operands[i], 1, 1))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected, emit X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
+                          SDLoc(N));
+}
+
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -25611,6 +25768,16 @@
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
   if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+    SDValue Avg =
+        detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
+    if (Avg.getNode())
+      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), St->getAlignment());
+
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
@@ -26873,6 +27040,7 @@
   case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:       return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:       return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:   return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:     return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -4046,6 +4046,10 @@
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB  : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+                            SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW  : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+                            SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 
 // Intrinsic forms
 defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
@@ -4062,10 +4066,6 @@
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
-                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
-defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
-                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                  int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -250,6 +250,8 @@
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b,   INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w,   INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d,  INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phadd_w,  INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d,  INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1699,6 +1701,8 @@
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b,       INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w,       INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(sse2_pmaxs_w,      INTR_TYPE_2OP, ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmaxu_b,      INTR_TYPE_2OP, ISD::UMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmins_w,      INTR_TYPE_2OP, ISD::SMIN, 0),
Index: llvm/trunk/test/CodeGen/X86/avg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll
+++ llvm/trunk/test/CodeGen/X86/avg.ll
@@ -0,0 +1,627 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
+
+define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = load <4 x i8>, <4 x i8>* %b
+  %3 = zext <4 x i8> %1 to <4 x i32>
+  %4 = zext <4 x i8> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <4 x i32> %5, %4
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i8>
+  store <4 x i8> %8, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = load <8 x i8>, <8 x i8>* %b
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = zext <8 x i8> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <8 x i32> %5, %4
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i8>
+  store <8 x i8> %8, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgb (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = zext <16 x i8> %1 to <16 x i32>
+  %4 = zext <16 x i8> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <16 x i32> %5, %4
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i8>
+  store <16 x i8> %8, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = load <32 x i8>, <32 x i8>* %b
+  %3 = zext <32 x i8> %1 to <32 x i32>
+  %4 = zext <32 x i8> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <32 x i32> %5, %4
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i8>
+  store <32 x i8> %8, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = load <64 x i8>, <64 x i8>* %b
+  %3 = zext <64 x i8> %1 to <64 x i32>
+  %4 = zext <64 x i8> %2 to <64 x i32>
+  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <64 x i32> %5, %4
+  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <64 x i32> %7 to <64 x i8>
+  store <64 x i8> %8, <64 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = load <4 x i16>, <4 x i16>* %b
+  %3 = zext <4 x i16> %1 to <4 x i32>
+  %4 = zext <4 x i16> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <4 x i32> %5, %4
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  store <4 x i16> %8, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgw (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = zext <8 x i16> %1 to <8 x i32>
+  %4 = zext <8 x i16> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <8 x i32> %5, %4
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  store <8 x i16> %8, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = load <16 x i16>, <16 x i16>* %b
+  %3 = zext <16 x i16> %1 to <16 x i32>
+  %4 = zext <16 x i16> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <16 x i32> %5, %4
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  store <16 x i16> %8, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = load <32 x i16>, <32 x i16>* %b
+  %3 = zext <32 x i16> %1 to <32 x i32>
+  %4 = zext <32 x i16> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <32 x i32> %5, %4
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  store <32 x i16> %8, <32 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd (%rsi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = load <4 x i8>, <4 x i8>* %b
+  %3 = zext <4 x i8> %1 to <4 x i32>
+  %4 = zext <4 x i8> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, %4
+  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i8>
+  store <4 x i8> %8, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = load <8 x i8>, <8 x i8>* %b
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = zext <8 x i8> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, %4
+  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i8>
+  store <8 x i8> %8, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = zext <16 x i8> %1 to <16 x i32>
+  %4 = zext <16 x i8> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, %4
+  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i8>
+  store <16 x i8> %8, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = load <32 x i8>, <32 x i8>* %b
+  %3 = zext <32 x i8> %1 to <32 x i32>
+  %4 = zext <32 x i8> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, %4
+  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i8>
+  store <32 x i8> %8, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8_2
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = load <64 x i8>, <64 x i8>* %b
+  %3 = zext <64 x i8> %1 to <64 x i32>
+  %4 = zext <64 x i8> %2 to <64 x i32>
+  %5 = add nuw nsw <64 x i32> %4, %4
+  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <64 x i32> %7 to <64 x i8>
+  store <64 x i8> %8, <64 x i8>* undef, align 4
+  ret void
+}
+
+
+define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = load <4 x i16>, <4 x i16>* %b
+  %3 = zext <4 x i16> %1 to <4 x i32>
+  %4 = zext <4 x i16> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, %4
+  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  store <4 x i16> %8, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = zext <8 x i16> %1 to <8 x i32>
+  %4 = zext <8 x i16> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, %4
+  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  store <8 x i16> %8, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = load <16 x i16>, <16 x i16>* %b
+  %3 = zext <16 x i16> %1 to <16 x i32>
+  %4 = zext <16 x i16> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, %4
+  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  store <16 x i16> %8, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16_2
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = load <32 x i16>, <32 x i16>* %b
+  %3 = zext <32 x i16> %1 to <32 x i32>
+  %4 = zext <32 x i16> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, %4
+  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  store <32 x i16> %8, <32 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i8_const(<4 x i8>* %a) {
+; SSE2-LABEL: avg_v4i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <4 x i32> %4 to <4 x i8>
+  store <4 x i8> %5, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8_const(<8 x i8>* %a) {
+; SSE2-LABEL: avg_v8i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8_const(<16 x i8>* %a) {
+; SSE2-LABEL: avg_v16i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = zext <16 x i8> %1 to <16 x i32>
+  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <16 x i32> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8_const(<32 x i8>* %a) {
+; AVX2-LABEL: avg_v32i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = zext <32 x i8> %1 to <32 x i32>
+  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <32 x i32> %4 to <32 x i8>
+  store <32 x i8> %5, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8_const(<64 x i8>* %a) {
+; AVX512BW-LABEL: avg_v64i8_const
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = zext <64 x i8> %1 to <64 x i32>
+  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <64 x i32> %4 to <64 x i8>
+  store <64 x i8> %5, <64 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i16_const(<4 x i16>* %a) {
+; SSE2-LABEL: avg_v4i16_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <4 x i32> %4 to <4 x i16>
+  store <4 x i16> %5, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16_const(<8 x i16>* %a) {
+; SSE2-LABEL: avg_v8i16_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = zext <8 x i16> %1 to <8 x i32>
+  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <8 x i32> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16_const(<16 x i16>* %a) {
+; AVX2-LABEL: avg_v16i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = zext <16 x i16> %1 to <16 x i32>
+  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <16 x i32> %4 to <16 x i16>
+  store <16 x i16> %5, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16_const(<32 x i16>* %a) {
+; AVX512BW-LABEL: avg_v32i16_const
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = zext <32 x i16> %1 to <32 x i32>
+  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <32 x i32> %4 to <32 x i16>
+  store <32 x i16> %5, <32 x i16>* undef, align 4
+  ret void
+}