Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -379,12 +379,6 @@ def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; @@ -1678,12 +1672,6 @@ def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">, Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; @@ -4947,24 +4935,6 @@ def int_x86_avx512_mask_pmulh_w_256 : GCCBuiltin<"__builtin_ia32_pmulhw256_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_b_128 : GCCBuiltin<"__builtin_ia32_pavgb128_mask">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_b_256 : GCCBuiltin<"__builtin_ia32_pavgb256_mask">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_w_128 : GCCBuiltin<"__builtin_ia32_pavgw128_mask">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_pavg_w_256 : GCCBuiltin<"__builtin_ia32_pavgw256_mask">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmaddw_d_128 : GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">, Intrinsic<[llvm_v4i32_ty], Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -252,7 +252,10 @@ Name.startswith("avx512.mask.move.s") || // Added in 4.0 Name.startswith("avx512.cvtmask2") || // Added in 5.0 (Name.startswith("xop.vpcom") && // Added in 3.2 - F->arg_size() == 2)) + F->arg_size() == 2) || + Name.startswith("sse2.pavg") || // Added in 6.0 + Name.startswith("avx2.pavg") || // Added in 6.0 + Name.startswith("avx512.mask.pavg")) // Added in 6.0 return true; return 
false; @@ -1972,6 +1975,25 @@ LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8); LI->setMetadata(M->getMDKindID("nontemporal"), Node); Rep = LI; + } else if (IsX86 && + (Name.startswith("sse2.pavg") || Name.startswith("avx2.pavg") || + Name.startswith("avx512.mask.pavg"))) { + // llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w, + // llvm.x86.avx512.mask.pavg.b/w + Value *A = CI->getArgOperand(0); + Value *B = CI->getArgOperand(1); + VectorType *ZextType = VectorType::getExtendedElementVectorType( + cast<VectorType>(A->getType())); + Value *ExtendedA = Builder.CreateZExt(A, ZextType); + Value *ExtendedB = Builder.CreateZExt(B, ZextType); + Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB); + Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1)); + Value *ShiftR = Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1)); + Rep = Builder.CreateTrunc(ShiftR, A->getType()); + if (CI->getNumArgOperands() > 2) { + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) { Value *Arg = CI->getArgOperand(0); Value *Neg = Builder.CreateNeg(Arg, "neg"); Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -383,8 +383,6 @@ X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -818,12 +816,6 @@ X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK, X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK, @@ -1593,8 +1585,6 @@ X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), Index: llvm/trunk/test/CodeGen/X86/avg-mask.ll
=================================================================== --- llvm/trunk/test/CodeGen/X86/avg-mask.ll +++ llvm/trunk/test/CodeGen/X86/avg-mask.ll @@ -0,0 +1,449 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind { +; AVX512F-LABEL: avg_v16i8_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v16i8_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} +; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512BWVL-NEXT: retq + %za = zext <16 x i8> %a to <16 x i16> + %zb = zext <16 x i8> %b to <16 x i16> + %add = add nuw nsw <16 x i16> %za, %zb + %add1 = add nuw nsw <16 x i16> %add, + %lshr = lshr <16 x i16> %add1, + %trunc = trunc <16 x i16> %lshr to <16 x i8> + %mask1 = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src + ret <16 x i8> %res +} + +define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind { +; AVX512F-LABEL: avg_v16i8_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v16i8_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <16 x i8> %a to <16 x i16> + %zb = zext <16 x i8> %b to <16 x i16> + %add = add nuw nsw <16 x i16> %za, %zb + %add1 = add nuw nsw <16 x i16> %add, + %lshr = lshr <16 x i16> %add1, + %trunc = trunc <16 x i16> %lshr to <16 x i8> + %mask1 = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind { +; AVX512F-LABEL: avg_v32i8_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v32i8_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BWVL-NEXT: vmovdqa %ymm2, 
%ymm0 +; AVX512BWVL-NEXT: retq + %za = zext <32 x i8> %a to <32 x i16> + %zb = zext <32 x i8> %b to <32 x i16> + %add = add nuw nsw <32 x i16> %za, %zb + %add1 = add nuw nsw <32 x i16> %add, + %lshr = lshr <32 x i16> %add1, + %trunc = trunc <32 x i16> %lshr to <32 x i8> + %mask1 = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src + ret <32 x i8> %res +} + +define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind { +; AVX512F-LABEL: avg_v32i8_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v32i8_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <32 x i8> %a to <32 x i16> + %zb = zext <32 x i8> %b to <32 x i16> + %add = add nuw nsw <32 x i16> %za, %zb + %add1 = add nuw nsw <32 x i16> %add, + %lshr = lshr <32 x i16> %add1, + %trunc = trunc <32 x i16> %lshr to <32 x i8> + %mask1 = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer + ret <32 x i8> %res +} + +define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind { +; AVX512F-LABEL: avg_v64i8_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; 
+; AVX512BWVL-LABEL: avg_v64i8_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovq %rdi, %k1 +; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BWVL-NEXT: retq + %za = zext <64 x i8> %a to <64 x i16> + %zb = zext <64 x i8> %b to <64 x i16> + %add = add nuw nsw <64 x i16> %za, %zb + %add1 = add nuw nsw <64 x i16> %add, + %lshr = lshr <64 x i16> %add1, + %trunc = trunc <64 x i16> %lshr to <64 x i8> + %mask1 = bitcast i64 %mask to <64 x i1> + %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src + ret <64 x i8> %res +} + +define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { +; AVX512F-LABEL: avg_v64i8_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v64i8_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovq %rdi, %k1 +; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <64 x i8> %a to <64 x i16> + %zb = zext <64 x i8> %b to <64 x i16> + %add = add nuw nsw <64 x i16> %za, %zb + %add1 = add nuw nsw <64 x i16> %add, + %lshr = lshr <64 x i16> %add1, + %trunc = trunc <64 x i16> %lshr to <64 x i8> + %mask1 = bitcast i64 %mask to <64 x i1> + %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer + ret <64 x i8> %res +} + +define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind { +; AVX512F-LABEL: avg_v8i16_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v8i16_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm2 
{%k1} +; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512BWVL-NEXT: retq + %za = zext <8 x i16> %a to <8 x i32> + %zb = zext <8 x i16> %b to <8 x i32> + %add = add nuw nsw <8 x i32> %za, %zb + %add1 = add nuw nsw <8 x i32> %add, + %lshr = lshr <8 x i32> %add1, + %trunc = trunc <8 x i32> %lshr to <8 x i16> + %mask1 = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src + ret <8 x i16> %res +} + +define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind { +; AVX512F-LABEL: avg_v8i16_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v8i16_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <8 x i16> %a to <8 x i32> + %zb = zext <8 x i16> %b to <8 x i32> + %add = add nuw nsw <8 x i32> %za, %zb + %add1 = add nuw nsw <8 x i32> %add, + %lshr = lshr <8 x i32> %add1, + %trunc = trunc <8 x i32> %lshr to <8 x i16> + %mask1 = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind { +; AVX512F-LABEL: avg_v16i16_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v16i16_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512BWVL-NEXT: retq + %za = zext <16 x i16> %a to <16 x i32> + %zb = zext <16 x i16> %b to <16 x i32> + %add = add nuw nsw <16 x i32> %za, %zb + %add1 = add nuw nsw <16 x i32> %add, + %lshr = lshr <16 x i32> %add1, + %trunc = trunc <16 x i32> %lshr to <16 x i16> + %mask1 = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src + ret <16 x i16> %res +} + +define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind { +; AVX512F-LABEL: avg_v16i16_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v16i16_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <16 x i16> %a to <16 x i32> + %zb = zext <16 x i16> %b to <16 x i32> + %add = add nuw nsw <16 x i32> %za, %zb + %add1 = add nuw nsw <16 x i32> %add, + %lshr = lshr <16 x i32> %add1, + %trunc = trunc <16 x i32> %lshr to <16 x i16> + %mask1 = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 
%mask) nounwind { +; AVX512F-LABEL: avg_v32i16_mask: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v32i16_mask: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BWVL-NEXT: retq + %za = zext <32 x i16> %a to <32 x i32> + %zb = zext <32 x i16> %b to <32 x i32> + %add = add nuw nsw <32 x i32> %za, %zb + %add1 = add nuw nsw <32 x i32> %add, + %lshr = lshr <32 x i32> %add1, + %trunc = trunc <32 x i32> %lshr to <32 x i16> + %mask1 = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src + ret <32 x i16> %res +} + +define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { +; AVX512F-LABEL: avg_v32i16_maskz: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: movl %edi, (%rsp) +; AVX512F-NEXT: kmovw (%rsp), %k1 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: movq %rbp, 
%rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512BWVL-LABEL: avg_v32i16_maskz: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: kmovd %edi, %k1 +; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BWVL-NEXT: retq + %za = zext <32 x i16> %a to <32 x i32> + %zb = zext <32 x i16> %b to <32 x i32> + %add = add nuw nsw <32 x i32> %za, %zb + %add1 = add nuw nsw <32 x i32> %add, + %lshr = lshr <32 x i32> %add1, + %trunc = trunc <32 x i32> %lshr to <32 x i16> + %mask1 = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer + ret <32 x i16> %res +} Index: llvm/trunk/test/CodeGen/X86/avg.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avg.ll +++ llvm/trunk/test/CodeGen/X86/avg.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW -define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) { +define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v4i8: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -33,7 +33,7 @@ ret void } -define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) { +define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v8i8: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -61,7 +61,7 @@ ret void } -define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { +define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rsi), %xmm0 @@ -87,7 +87,7 @@ ret void } -define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { +define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 @@ -265,7 +265,7 @@ ret void } -define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { +define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm6 @@ -450,8 +450,6 @@ ; AVX1-LABEL: avg_v64i8: ; AVX1: # BB#0: ; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: .Lcfi0: -; AVX1-NEXT: .cfi_def_cfa_offset 32 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero @@ -727,7 +725,7 @@ ret void } -define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) { +define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v4i16: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -755,7 +753,7 @@ ret void } -define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { +define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v8i16: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rsi), %xmm0 @@ -781,7 +779,7 @@ ret void } -define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { +define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm2 @@ -890,7 +888,7 @@ ret void } -define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) 
{ +define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm4 @@ -1116,7 +1114,7 @@ ret void } -define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) { +define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v4i8_2: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1144,7 +1142,7 @@ ret void } -define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) { +define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v8i8_2: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -1172,7 +1170,7 @@ ret void } -define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { +define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v16i8_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 @@ -1198,7 +1196,7 @@ ret void } -define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { +define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 @@ -1376,7 +1374,7 @@ ret void } -define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) { +define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rsi), %xmm14 @@ -1750,7 +1748,7 @@ } -define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) { +define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v4i16_2: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -1778,7 +1776,7 @@ ret void } -define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { +define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v8i16_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 @@ -1804,7 +1802,7 @@ ret void } -define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { +define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm2 @@ -1913,7 +1911,7 @@ ret void } -define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { +define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm4 @@ -2139,7 +2137,7 @@ ret void } -define void @avg_v4i8_const(<4 x i8>* %a) { +define void @avg_v4i8_const(<4 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v4i8_const: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2162,7 +2160,7 @@ ret void } -define void @avg_v8i8_const(<8 x i8>* %a) { +define void @avg_v8i8_const(<8 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v8i8_const: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -2185,7 +2183,7 @@ ret void } -define void @avg_v16i8_const(<16 x i8>* %a) { +define void @avg_v16i8_const(<16 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v16i8_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 @@ -2208,7 +2206,7 @@ ret void } -define void @avg_v32i8_const(<32 x i8>* %a) { +define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm5 @@ -2341,7 +2339,7 @@ ret void } -define void @avg_v64i8_const(<64 x i8>* %a) { +define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v64i8_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm5 @@ -2661,7 +2659,7 @@ ret void } -define void @avg_v4i16_const(<4 x i16>* %a) { +define void @avg_v4i16_const(<4 x i16>* %a) nounwind { ; 
SSE2-LABEL: avg_v4i16_const: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -2684,7 +2682,7 @@ ret void } -define void @avg_v8i16_const(<8 x i16>* %a) { +define void @avg_v8i16_const(<8 x i16>* %a) nounwind { ; SSE2-LABEL: avg_v8i16_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 @@ -2707,7 +2705,7 @@ ret void } -define void @avg_v16i16_const(<16 x i16>* %a) { +define void @avg_v16i16_const(<16 x i16>* %a) nounwind { ; SSE2-LABEL: avg_v16i16_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 @@ -2795,7 +2793,7 @@ ret void } -define void @avg_v32i16_const(<32 x i16>* %a) { +define void @avg_v32i16_const(<32 x i16>* %a) nounwind { ; SSE2-LABEL: avg_v32i16_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa (%rdi), %xmm7 @@ -2968,3 +2966,332 @@ store <32 x i16> %5, <32 x i16>* undef, align 4 ret void } + +define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: avg_v16i8_3: +; SSE2: # BB#0: +; SSE2-NEXT: pavgb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v16i8_3: +; AVX: # BB#0: +; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %za = zext <16 x i8> %a to <16 x i16> + %zb = zext <16 x i8> %b to <16 x i16> + %add = add nuw nsw <16 x i16> %za, %zb + %add1 = add nuw nsw <16 x i16> %add, + %lshr = lshr <16 x i16> %add1, + %res = trunc <16 x i16> %lshr to <16 x i8> + ret <16 x i8> %res +} + +define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind { +; SSE2-LABEL: avg_v32i8_3: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE2-NEXT: paddw %xmm6, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE2-NEXT: paddw %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddw %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm4 +; SSE2-NEXT: psubw %xmm3, %xmm0 +; SSE2-NEXT: psubw %xmm3, %xmm2 +; SSE2-NEXT: psubw %xmm3, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: 
pand %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v32i8_3: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v32i8_3: +; AVX2: # BB#0: +; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v32i8_3: +; AVX512: # BB#0: +; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %za = zext <32 x i8> %a to <32 x i16> + %zb = zext <32 x i8> %b to <32 x i16> + %add = add nuw nsw <32 x i16> %za, %zb + %add1 = add nuw nsw <32 x i16> %add, + %lshr = lshr <32 x i16> %add1, + %res = trunc <32 x i16> %lshr to <32 x i8> + ret <32 x i8> %res +} + +define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind { +; SSE2-LABEL: avg_v64i8_3: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm2, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm3, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; SSE2-NEXT: paddw %xmm10, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: paddw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE2-NEXT: paddw %xmm11, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: paddw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE2-NEXT: paddw %xmm12, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: paddw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] +; SSE2-NEXT: paddw %xmm13, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE2-NEXT: paddw %xmm7, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: psubw %xmm7, %xmm8 +; SSE2-NEXT: psubw %xmm7, %xmm0 +; SSE2-NEXT: psubw %xmm7, %xmm4 +; SSE2-NEXT: psubw %xmm7, %xmm1 +; SSE2-NEXT: psubw %xmm7, %xmm5 +; SSE2-NEXT: psubw %xmm7, %xmm2 +; SSE2-NEXT: psubw %xmm7, %xmm6 +; SSE2-NEXT: psubw %xmm7, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm6 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm1 +; 
SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v64i8_3: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = 
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8 +; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4 +; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0 +; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9 +; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v64i8_3: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero +; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: avg_v64i8_3: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_3: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %za = zext <64 x i8> %a to <64 x i16> + %zb = zext <64 x i8> %b to <64 x i16> + %add = add nuw nsw <64 x i16> %za, %zb + %add1 = add nuw nsw <64 x i16> %add, + %lshr = lshr <64 x i16> %add1, + %res = trunc <64 x i16> %lshr to <64 x i8> + ret <64 x i8> %res +} Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -259,7 +259,7 @@ ret <4 x i64> %res } -define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) { +define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) 
nounwind { ; X32-LABEL: test_mm256_avg_epu8: ; X32: # BB#0: ; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0 @@ -271,13 +271,17 @@ ; X64-NEXT: retq %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1) + %zext0 = zext <32 x i8> %arg0 to <32 x i16> + %zext1 = zext <32 x i8> %arg1 to <32 x i16> + %add = add <32 x i16> %zext0, %zext1 + %add1 = add <32 x i16> %add, + %lshr = lshr <32 x i16> %add1, + %res = trunc <32 x i16> %lshr to <32 x i8> %bc = bitcast <32 x i8> %res to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone -define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) { +define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; X32-LABEL: test_mm256_avg_epu16: ; X32: # BB#0: ; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0 @@ -289,11 +293,15 @@ ; X64-NEXT: retq %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1) + %zext0 = zext <16 x i16> %arg0 to <16 x i32> + %zext1 = zext <16 x i16> %arg1 to <16 x i32> + %add = add <16 x i32> %zext0, %zext1 + %add1 = add <16 x i32> %add, + %lshr = lshr <16 x i32> %add1, + %res = trunc <16 x i32> %lshr to <16 x i16> %bc = bitcast <16 x i16> %res to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; X32-LABEL: test_mm256_blend_epi16: Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -514,3 +514,23 @@ } declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone +define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK-LABEL: mm256_avg_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retl + %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone + +define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: mm256_avg_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retl + %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone + Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -114,38 +114,6 @@ declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone -define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) { -; AVX2-LABEL: test_x86_avx2_pavg_b: -; AVX2: ## BB#0: -; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe0,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_pavg_b: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1] -; 
AVX512VL-NEXT: retl ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) { -; AVX2-LABEL: test_x86_avx2_pavg_w: -; AVX2: ## BB#0: -; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe3,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_pavg_w: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone - - define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) { ; AVX2-LABEL: test_x86_avx2_pmadd_wd: ; AVX2: ## BB#0: @@ -1340,18 +1308,18 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4 -; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4 +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 +; AVX2-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovdqa LCPI90_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; AVX512VL-NEXT: vmovdqa LCPI88_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4 -; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4 +; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 +; AVX512VL-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 ; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res @@ -1377,18 +1345,18 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4 -; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4 +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4 +; AVX2-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4 ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovdqa LCPI92_0, %ymm0 ## EVEX TO VEX Compression ymm0 = 
[2,9,4294967284,23,4294967270,37,4294967256,51] +; AVX512VL-NEXT: vmovdqa LCPI90_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4 -; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4 +; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4 +; AVX512VL-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4 ; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -3610,3 +3610,54 @@ } declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone + + +declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <64 x i8>@mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: mm512_avg_epu8: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: mm512_avg_epu8: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm3 +; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + %res2 = add <64 x i8> %res, %res1 + ret <64 x i8> %res2 +} + + +declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: mm512_avg_epu16: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: mm512_avg_epu16: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm3 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll +++ 
llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1073,56 +1073,6 @@ ret <32 x i16> %res2 } -declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) - -define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { -; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 -; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 -} - -declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { -; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 -} - declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) { Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -3445,3 +3445,67 @@ } declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone + +define <16 x i8>@mm_mask_avg_epu8(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { +; CHECK-LABEL: mm_mask_avg_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1] +; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) + %res1 = call <16 x 
i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + %res2 = add <16 x i8> %res, %res1 + ret <16 x i8> %res2 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8>@mm256_mask_avg_epu8(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { +; CHECK-LABEL: mm256_mask_avg_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + %res2 = add <32 x i8> %res, %res1 + ret <32 x i8> %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <8 x i16>@mm_mask_avg_epu16(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: mm_mask_avg_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16>@mm256_mask_avg_epu16(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: mm256_mask_avg_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1900,70 +1900,6 @@ ret <16 x i16> %res2 } -declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { -; CHECK-LABEL: 
test_int_x86_avx512_mask_pavg_b_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1] -; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 -} - -declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1] -; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1] -; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 -} - -declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1] -; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1] -; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 -} - -declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1] -; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x 
i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 -} - declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16) define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -252,11 +252,15 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) + %zext0 = zext <16 x i8> %arg0 to <16 x i16> + %zext1 = zext <16 x i8> %arg1 to <16 x i16> + %add = add <16 x i16> %zext0, %zext1 + %add1 = add <16 x i16> %add, + %lshr = lshr <16 x i16> %add1, + %res = trunc <16 x i16> %lshr to <16 x i8> %bc = bitcast <16 x i8> %res to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_avg_epu16: @@ -270,11 +274,15 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1) + %zext0 = zext <8 x i16> %arg0 to <8 x i32> + %zext1 = zext <8 x i16> %arg1 to <8 x i32> + %add = add <8 x i32> %zext0, %zext1 + %add1 = add <8 x i32> %add, + %lshr = lshr <8 x i32> %add1, + %res = trunc <8 x i32> %lshr to <8 x i16> %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm_bslli_si128: Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -282,5 +282,24 @@ } declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone +define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: mm_avg_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: pavgb %xmm1, %xmm0 +; CHECK-NEXT: retl + %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: mm_avg_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: pavgw %xmm1, %xmm0 +; CHECK-NEXT: retl + %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -809,48 +809,6 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_pavg_b: -; SSE: ## BB#0: -; SSE-NEXT: pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1] -; 
SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_pavg_b: -; AVX2: ## BB#0: -; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_pavg_b: -; SKX: ## BB#0: -; SKX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_pavg_w: -; SSE: ## BB#0: -; SSE-NEXT: pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_pavg_w: -; AVX2: ## BB#0: -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_pavg_w: -; SKX: ## BB#0: -; SKX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_pmadd_wd: ; SSE: ## BB#0: Index: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll @@ -3978,12 +3978,21 @@ ; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) - %2 = load <16 x i8>, <16 x i8> *%a2, align 16 - %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) - ret <16 x i8> %3 + %1 = zext <16 x i8> %a0 to <16 x i16> + %2 = zext <16 x i8> %a1 to <16 x i16> + %3 = add <16 x i16> %1, %2 + %4 = add <16 x i16> %3, + %5 = lshr <16 x i16> %4, + %6 = trunc <16 x i16> %5 to <16 x i8> + %7 = load <16 x i8>, <16 x i8> *%a2, align 16 + %8 = zext <16 x i8> %6 to <16 x i16> + %9 = zext <16 x i8> %7 to <16 x i16> + %10 = add <16 x i16> %8, %9 + %11 = add <16 x i16> %10, + %12 = lshr <16 x i16> %11, + %13 = trunc <16 x i16> %12 to <16 x i8> + ret <16 x i8> %13 } -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_pavgw: @@ -4037,12 +4046,21 @@ ; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) - %2 = load <8 x i16>, <8 x i16> *%a2, align 16 - %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) - ret <8 x i16> %3 + %1 = zext <8 x i16> %a0 to <8 x i32> + %2 = zext <8 x i16> %a1 to <8 x i32> + %3 = add <8 x i32> %1, %2 + %4 = add <8 x i32> %3, + %5 = lshr <8 x i32> %4, + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = load <8 x i16>, <8 x i16> *%a2, align 16 + %8 = zext <8 x i16> %6 to <8 x i32> + %9 = zext <8 x i16> %7 to <8 x i32> + %10 
= add <8 x i32> %8, %9 + %11 = add <8 x i32> %10, + %12 = lshr <8 x i32> %11, + %13 = trunc <8 x i32> %12 to <8 x i16> + ret <8 x i16> %13 } -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; GENERIC-LABEL: test_pcmpeqb: Index: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll @@ -275,19 +275,27 @@ ;CHECK-LABEL: stack_fold_pavgb ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) - ret <16 x i8> %2 + %2 = zext <16 x i8> %a0 to <16 x i16> + %3 = zext <16 x i8> %a1 to <16 x i16> + %4 = add <16 x i16> %2, %3 + %5 = add <16 x i16> %4, + %6 = lshr <16 x i16> %5, + %7 = trunc <16 x i16> %6 to <16 x i8> + ret <16 x i8> %7 } -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) - ret <8 x i16> %2 + %2 = zext <8 x i16> %a0 to <8 x i32> + %3 = zext <8 x i16> %a1 to <8 x i32> + %4 = add <8 x i32> %2, %3 + %5 = add <8 x i32> %4, + %6 = lshr <8 x i32> %5, + %7 = trunc <8 x i32> %6 to <8 x i16> + ret <8 x i16> %7 } -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) { ;CHECK-LABEL: stack_fold_pblendvb Index: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx2.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -234,19 +234,27 @@ ;CHECK-LABEL: stack_fold_pavgb ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) - ret <32 x i8> %2 + %2 = zext <32 x i8> %a0 to <32 x i16> + %3 = zext <32 x i8> %a1 to <32 x i16> + %4 = add <32 x i16> %2, %3 + %5 = add <32 x i16> %4, + %6 = lshr <32 x i16> %5, + %7 = trunc <32 x i16> %6 to <32 x i8> + ret <32 x i8> %7 } -declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) - ret <16 x i16> %2 + %2 = zext <16 x i16> %a0 to <16 x i32> + %3 = zext <16 x i16> %a1 to <16 x i32> + %4 = add <16 x i32> %2, %3 + %5 = add <16 x i32> %4, + %6 = lshr <16 x i32> %5, + %7 = trunc <16 x i32> %6 to <16 x i16> + ret <16 x i16> %7 } -declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) { ;CHECK-LABEL: stack_fold_pblendd Index: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -70,52 +70,88 @@ ;CHECK-LABEL: stack_fold_pavgb ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1) - ret <64 x i8> %2 + %2 = zext <64 x i8> %a0 to <64 x i16> + %3 = zext <64 x i8> %a1 to <64 x i16> + %4 = add <64 x i16> %2, %3 + %5 = add <64 x i16> %4, + %6 = lshr <64 x i16> %5, + %7 = trunc <64 x i16> %6 to <64 x i8> + ret <64 x i8> %7 } -declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone define <64 x i8> @stack_fold_pavgb_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ;CHECK-LABEL: stack_fold_pavgb_mask ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = load <64 x i8>, <64 x i8>* %passthru - %3 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %2, i64 %mask) - ret <64 x i8> %3 + %3 = zext <64 x i8> %a0 to <64 x i16> + %4 = zext <64 x i8> %a1 to <64 x i16> + %5 = add <64 x i16> %3, %4 + %6 = add <64 x i16> %5, + %7 = lshr <64 x i16> %6, + %8 = trunc <64 x i16> %7 to <64 x i8> + %9 = bitcast i64 %mask to <64 x i1> + %10 = select <64 x i1> %9, <64 x i8> %8, <64 x i8> %2 + ret <64 x i8> %10 } define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ;CHECK-LABEL: stack_fold_pavgb_maskz ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, 
<64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask) - ret <64 x i8> %2 + %2 = zext <64 x i8> %a0 to <64 x i16> + %3 = zext <64 x i8> %a1 to <64 x i16> + %4 = add <64 x i16> %2, %3 + %5 = add <64 x i16> %4, + %6 = lshr <64 x i16> %5, + %7 = trunc <64 x i16> %6 to <64 x i8> + %8 = bitcast i64 %mask to <64 x i1> + %9 = select <64 x i1> %8, <64 x i8> %7, <64 x i8> zeroinitializer + ret <64 x i8> %9 } define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1) - ret <32 x i16> %2 + %2 = zext <32 x i16> %a0 to <32 x i32> + %3 = zext <32 x i16> %a1 to <32 x i32> + %4 = add <32 x i32> %2, %3 + %5 = add <32 x i32> %4, + %6 = lshr <32 x i32> %5, + %7 = trunc <32 x i32> %6 to <32 x i16> + ret <32 x i16> %7 } -declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone define <32 x i16> @stack_fold_pavgw_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ;CHECK-LABEL: stack_fold_pavgw_mask ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = load <32 x i16>, <32 x i16>* %passthru - %3 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %2, i32 %mask) - ret <32 x i16> %3 + %3 = zext <32 x i16> %a0 to <32 x i32> + %4 = zext <32 x i16> %a1 to <32 x i32> + %5 = add <32 x i32> %3, %4 + %6 = add <32 x i32> %5, + %7 = lshr <32 x i32> %6, + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %2 + ret <32 x i16> %10 } define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ;CHECK-LABEL: stack_fold_pavgw_maskz ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %2 + %2 = zext <32 x i16> %a0 to <32 x i32> + %3 = zext <32 x i16> %a1 to <32 x i32> + %4 = add <32 x i32> %2, %3 + %5 = add <32 x i32> %4, + %6 = lshr <32 x i32> %5, + %7 = trunc <32 x i32> %6 to <32 x i16> + %8 = bitcast i32 %mask to <32 x i1> + %9 = select <32 x i1> %8, <32 x i16> %7, <32 x i16> zeroinitializer + ret <32 x i16> %9 
} define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) { Index: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -49,37 +49,53 @@ ;CHECK-LABEL: stack_fold_pavgb ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) - ret <16 x i8> %2 + %2 = zext <16 x i8> %a0 to <16 x i16> + %3 = zext <16 x i8> %a1 to <16 x i16> + %4 = add <16 x i16> %2, %3 + %5 = add <16 x i16> %4, + %6 = lshr <16 x i16> %5, + %7 = trunc <16 x i16> %6 to <16 x i8> + ret <16 x i8> %7 } -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ;CHECK-LABEL: stack_fold_pavgb_ymm ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) - ret <32 x i8> %2 + %2 = zext <32 x i8> %a0 to <32 x i16> + %3 = zext <32 x i8> %a1 to <32 x i16> + %4 = add <32 x i16> %2, %3 + %5 = add <32 x i16> %4, + %6 = lshr <32 x i16> %5, + %7 = trunc <32 x i16> %6 to <32 x i8> + ret <32 x i8> %7 } -declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) - ret <8 x i16> %2 + %2 = zext <8 x i16> %a0 to <8 x i32> + %3 = zext <8 x i16> %a1 to <8 x i32> + %4 = add <8 x i32> %2, %3 + %5 = add <8 x i32> %4, + %6 = lshr <8 x i32> %5, + %7 = trunc <8 x i32> %6 to <8 x i16> + ret <8 x i16> %7 } -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw_ymm ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) - ret <16 x i16> %2 + %2 = zext <16 x i16> %a0 to <16 x i32> + %3 = zext <16 x i16> %a1 to <16 x i32> + %4 = add <16 x i32> %2, %3 + %5 = add <16 x i32> %4, + %6 = lshr <16 x i32> %5, + %7 = trunc <16 x i32> %6 to <16 x i16> + ret <16 x i16> %7 } -declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_vpconflictd Index: llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll @@ -302,19 +302,27 @@ ;CHECK-LABEL: stack_fold_pavgb ;CHECK: pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) - ret <16 x i8> %2 + %2 = zext <16 x i8> %a0 to <16 x i16> + %3 = zext <16 x i8> %a1 to <16 x i16> + %4 = add <16 x i16> %2, %3 + %5 = add <16 x i16> %4, + %6 = lshr <16 x i16> %5, + %7 = trunc <16 x i16> %6 to <16 x i8> + ret <16 x i8> %7 } -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw ;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) - ret <8 x i16> %2 + %2 = zext <8 x i16> %a0 to <8 x i32> + %3 = zext <8 x i16> %a1 to <8 x i32> + %4 = add <8 x i32> %2, %3 + %5 = add <8 x i32> %4, + %6 = lshr <8 x i32> %5, + %7 = trunc <8 x i32> %6 to <8 x i16> + ret <8 x i16> %7 } -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) { ;CHECK-LABEL: stack_fold_pblendvb