Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -1392,80 +1392,6 @@ def int_x86_avx_ptestnzc_256 : GCCBuiltin<"__builtin_ia32_ptestnzc256">, Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_d_512 : GCCBuiltin<"__builtin_ia32_ptestmd512">, - Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_q_512 : GCCBuiltin<"__builtin_ia32_ptestmq512">, - Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, - llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_ptestm_b_128 : GCCBuiltin<"__builtin_ia32_ptestmb128">, - Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_b_256 : GCCBuiltin<"__builtin_ia32_ptestmb256">, - Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_b_512 : GCCBuiltin<"__builtin_ia32_ptestmb512">, - Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_d_128 : GCCBuiltin<"__builtin_ia32_ptestmd128">, - Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, - llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_d_256 : GCCBuiltin<"__builtin_ia32_ptestmd256">, - Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, - llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_q_128 : GCCBuiltin<"__builtin_ia32_ptestmq128">, - Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_q_256 : GCCBuiltin<"__builtin_ia32_ptestmq256">, - Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_w_128 : GCCBuiltin<"__builtin_ia32_ptestmw128">, - Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_w_256 : GCCBuiltin<"__builtin_ia32_ptestmw256">, - Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_ptestm_w_512 : GCCBuiltin<"__builtin_ia32_ptestmw512">, - Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_ptestnm_b_128 : GCCBuiltin<"__builtin_ia32_ptestnmb128">, - Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_b_256 : GCCBuiltin<"__builtin_ia32_ptestnmb256">, - Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_b_512 : GCCBuiltin<"__builtin_ia32_ptestnmb512">, - Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_d_128 : GCCBuiltin<"__builtin_ia32_ptestnmd128">, - Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, - llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_d_256 : GCCBuiltin<"__builtin_ia32_ptestnmd256">, - Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, - llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_d_512 : GCCBuiltin<"__builtin_ia32_ptestnmd512">, - Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, - llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_ptestnm_q_128 : GCCBuiltin<"__builtin_ia32_ptestnmq128">, - Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def 
int_x86_avx512_ptestnm_q_256 : GCCBuiltin<"__builtin_ia32_ptestnmq256">,
-          Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty,
-                     llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_ptestnm_q_512 : GCCBuiltin<"__builtin_ia32_ptestnmq512">,
-          Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty,
-                     llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_ptestnm_w_128 : GCCBuiltin<"__builtin_ia32_ptestnmw128">,
-          Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty,
-                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_ptestnm_w_256 : GCCBuiltin<"__builtin_ia32_ptestnmw256">,
-          Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty,
-                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_ptestnm_w_512 : GCCBuiltin<"__builtin_ia32_ptestnmw512">,
-          Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty,
-                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_x86_avx512_mask_fpclass_pd_128 : GCCBuiltin<"__builtin_ia32_fpclasspd128_mask">,
Index: llvm/trunk/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp
@@ -259,6 +259,8 @@
       Name.startswith("avx512.cvtmask2") || // Added in 5.0
       (Name.startswith("xop.vpcom") && // Added in 3.2
        F->arg_size() == 2) ||
+      Name.startswith("avx512.ptestm") || //Added in 6.0
+      Name.startswith("avx512.ptestnm") || //Added in 6.0
       Name.startswith("sse2.pavg") || // Added in 6.0
       Name.startswith("avx2.pavg") || // Added in 6.0
       Name.startswith("avx512.mask.pavg")) // Added in 6.0
@@ -826,6 +828,26 @@
   return Res;
 }
 
+// Applying mask on vector of i1's and make sure result is at least 8 bits wide.
+static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder,Value *Vec, Value *Mask,
+                                     unsigned NumElts) {
+  const auto *C = dyn_cast<Constant>(Mask);
+  if (!C || !C->isAllOnesValue())
+    Vec = Builder.CreateAnd(Vec, getX86MaskVec(Builder, Mask, NumElts));
+
+  if (NumElts < 8) {
+    uint32_t Indices[8];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i;
+    for (unsigned i = NumElts; i != 8; ++i)
+      Indices[i] = NumElts + i % NumElts;
+    Vec = Builder.CreateShuffleVector(Vec,
+                                      Constant::getNullValue(Vec->getType()),
+                                      Indices);
+  }
+  return Builder.CreateBitCast(Vec, Builder.getIntNTy(std::max(NumElts, 8U)));
+}
+
 static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
                                    unsigned CC, bool Signed) {
   Value *Op0 = CI.getArgOperand(0);
@@ -851,22 +873,8 @@
   }
 
   Value *Mask = CI.getArgOperand(CI.getNumArgOperands() - 1);
-  const auto *C = dyn_cast<Constant>(Mask);
-  if (!C || !C->isAllOnesValue())
-    Cmp = Builder.CreateAnd(Cmp, getX86MaskVec(Builder, Mask, NumElts));
-  if (NumElts < 8) {
-    uint32_t Indices[8];
-    for (unsigned i = 0; i != NumElts; ++i)
-      Indices[i] = i;
-    for (unsigned i = NumElts; i != 8; ++i)
-      Indices[i] = NumElts + i % NumElts;
-    Cmp = Builder.CreateShuffleVector(Cmp,
-                                      Constant::getNullValue(Cmp->getType()),
-                                      Indices);
-  }
-  return Builder.CreateBitCast(Cmp, IntegerType::get(CI.getContext(),
-                                                     std::max(NumElts, 8U)));
+  return ApplyX86MaskOn1BitsVec(Builder, Cmp, Mask, NumElts);
 }
 
 // Replace a masked intrinsic with an older unmasked intrinsic.
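
The hunk that follows (together with the ApplyX86MaskOn1BitsVec helper above) rewrites every removed ptestm/ptestnm intrinsic into generic IR: it ANDs the two vector operands, compares the result against zero (ne for ptestm, eq for ptestnm), ANDs the resulting <N x i1> with the mask argument when the mask is not all-ones, widens the vector to at least eight i1 elements, and bitcasts it back to an integer. A rough sketch of the expected output for the 128-bit dword variant with a non-constant mask is shown below; the value names (%a, %b, %mask) are illustrative only and do not appear in the patch:

  ; Sketch only - value names are illustrative, not taken from the patch.
  ; Before upgrade (intrinsic removed by this patch):
  ;   %r = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ; Approximate IR emitted by the upgrader:
  %and  = and <4 x i32> %a, %b
  %cmp  = icmp ne <4 x i32> %and, zeroinitializer
  %mvec = bitcast i8 %mask to <8 x i1>
  %mlow = shufflevector <8 x i1> %mvec, <8 x i1> %mvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sel  = and <4 x i1> %cmp, %mlow
  %wide = shufflevector <4 x i1> %sel, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r    = bitcast <8 x i1> %wide to i8

The widening step exists because the 2- and 4-element variants (for example ptestm.q.128, ptestm.d.128, ptestm.q.256) still return i8, so the <2 x i1> or <4 x i1> comparison result has to be padded with zero elements before it can be bitcast back to the intrinsic's original return type.
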
@@ -1038,7 +1046,20 @@
                                ExtTy->getPrimitiveSizeInBits();
     Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
     Rep = Builder.CreateVectorSplat(NumElts, Rep);
-  } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))) {
+  } else if (IsX86 && (Name.startswith("avx512.ptestm") ||
+                       Name.startswith("avx512.ptestnm"))) {
+    Value *Op0 = CI->getArgOperand(0);
+    Value *Op1 = CI->getArgOperand(1);
+    Value *Mask = CI->getArgOperand(2);
+    Rep = Builder.CreateAnd(Op0, Op1);
+    llvm::Type *Ty = Op0->getType();
+    Value *Zero = llvm::Constant::getNullValue(Ty);
+    ICmpInst::Predicate Pred =
+      Name.startswith("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+    Rep = Builder.CreateICmp(Pred, Rep, Zero);
+    unsigned NumElts = Op0->getType()->getVectorNumElements();
+    Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask, NumElts);
+  } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
     unsigned NumElts = CI->getArgOperand(1)->getType()->getVectorNumElements();
     Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1385,30 +1385,6 @@
   X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
=================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -228,6 +228,164 @@ } +define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <16 x i32> + %1 = icmp eq <16 x i32> %0, zeroinitializer + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <16 x i32> + %1 = icmp eq <16 x i32> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmq %zmm0, %zmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 +} + +define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %2 = and <8 x i1> %0, %1 + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} + +define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_test_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestmd 
%zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_test_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <16 x i32> + %1 = icmp ne <16 x i32> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_test_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_test_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %2 = and <8 x i1> %0, %1 + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) { ; X32-LABEL: test_mm512_mask_set1_epi32: Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -3724,3 +3724,77 @@ ret <8 x i64> %res2 } +define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) { +; CHECK-LABEL: test_vptestmq: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) + %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) + %res2 = add i8 %res1, %res + ret i8 %res2 +} +declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) { +; CHECK-LABEL: test_vptestmd: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) + %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) + %res2 = add i16 %res1, %res + ret i16 %res2 +} +declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) + +declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmd %zmm1, %zmm0, 
%k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k1 {%k1} +; CHECK-NEXT: kmovw %k1, %ecx +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k1 {%k1} +; CHECK-NEXT: kmovw %k1, %ecx +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -766,42 +766,6 @@ declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) -define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) { -; CHECK-LABEL: test_vptestmq: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq - %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) - %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) - %res2 = add i8 %res1, %res - ret i8 %res2 -} -declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) - -define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) { -; CHECK-LABEL: test_vptestmd: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) - %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) - %res2 = add i16 %res1, %res - ret i16 %res2 -} -declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) - define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { ; CHECK-LABEL: test_mask_store_ss: ; CHECK: ## BB#0: @@ -4064,47 +4028,6 @@ ret <2 x double> %res4 } -declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) - -define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: ## kill: 
%AX %AX %EAX -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq - %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - - - - declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -1904,5 +1904,1655 @@ ret <8 x i64> %res2 } +define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $8, %esp +; X32-NEXT: vptestmb %zmm0, %zmm1, %k0 +; X32-NEXT: kmovq %k0, (%esp) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmb %zmm0, %zmm1, %k0 +; X64-NEXT: kmovq %k0, %rax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <64 x i8> + %1 = icmp ne <64 x i8> %0, zeroinitializer + %2 = bitcast <64 x i1> %1 to i64 + ret i64 %2 +} + +define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $256, %esp # imm = 0x100 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) # 64-byte Spill +; X32-NEXT: vmovaps %zmm0, {{[0-9]+}}(%esp) # 64-byte Spill +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: kmovd %eax, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, 
%ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpbroadcastw %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpslld $24, %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $4, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpbroadcastd %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $5, %cl +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpsllq $40, %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $6, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, 
%ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $13, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx +; X32-NEXT: 
kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $16, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $15, %dl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: shrb $2, %bl +; X32-NEXT: kmovd %ebx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $4, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $5, %dl +; X32-NEXT: andb $1, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, 
%zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $6, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $24, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $29, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrl $31, %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; X32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: vpmovm2b %k1, %zmm7 +; X32-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm6, %ymm1, %ymm7, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; 
X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $4, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $5, %cl +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $6, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 
$1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $13, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $16, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $15, %dl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: shrb $2, %bl +; X32-NEXT: kmovd %ebx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $4, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $5, %dl +; X32-NEXT: andb $1, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $6, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $24, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k0 +; X32-NEXT: vpmovb2m %zmm0, %k1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k1, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; X32-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm5 +; X32-NEXT: vpbroadcastd %xmm5, %xmm5 +; X32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $29, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm4 +; X32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; X32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; X32-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm3 +; X32-NEXT: vpbroadcastw %xmm3, %xmm3 +; X32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: kshiftlq $1, %k0, %k0 +; X32-NEXT: kshiftrq $1, %k0, %k0 +; X32-NEXT: shrl $31, %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: korq %k1, %k0, %k1 +; X32-NEXT: vmovdqa64 {{[0-9]+}}(%esp), %zmm0 # 64-byte Reload +; X32-NEXT: vmovdqa64 {{[0-9]+}}(%esp), %zmm1 # 64-byte Reload +; X32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1} +; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovq %rdi, %k1 +; X64-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovq %k0, %rax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <64 x i8> + %1 = icmp ne <64 x i8> %0, zeroinitializer + %2 = bitcast i64 %__U to <64 x i1> + %3 = and <64 x i1> %1, %2 + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmw %zmm0, %zmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmw %zmm0, %zmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; 
X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <32 x i16> + %1 = icmp ne <32 x i16> %0, zeroinitializer + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <32 x i16> + %1 = icmp ne <32 x i16> %0, zeroinitializer + %2 = bitcast i32 %__U to <32 x i1> + %3 = and <32 x i1> %1, %2 + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $8, %esp +; X32-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; X32-NEXT: kmovq %k0, (%esp) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; X64-NEXT: kmovq %k0, %rax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <64 x i8> + %1 = icmp eq <64 x i8> %0, zeroinitializer + %2 = bitcast <64 x i1> %1 to i64 + ret i64 %2 +} + +define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $256, %esp # imm = 0x100 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) # 64-byte Spill +; X32-NEXT: vmovaps %zmm0, {{[0-9]+}}(%esp) # 64-byte Spill +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: kmovd %eax, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpbroadcastw %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpslld $24, %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $4, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpbroadcastd %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $5, %cl +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpsllq $40, %xmm1, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $6, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 
= [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $13, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb 
%ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $16, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $15, %dl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: shrb $2, %bl +; X32-NEXT: kmovd %ebx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $4, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $5, %dl +; X32-NEXT: andb $1, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $6, %dl +; X32-NEXT: 
kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $24, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: 
vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $29, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrl $31, %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; X32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: vpmovm2b %k1, %zmm7 +; X32-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm6, %ymm1, %ymm7, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $4, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $5, %cl +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $6, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: andb $2, %cl +; X32-NEXT: shrb %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} 
xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movb %ah, %cl +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $13, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $16, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllw $8, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $15, %dl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: shrb $2, %bl +; X32-NEXT: kmovd %ebx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $3, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslld $24, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $4, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $5, %dl +; X32-NEXT: andb $1, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $40, %xmm0, %xmm0 +; 
X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $6, %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: shrb $7, %cl +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpsllq $56, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $24, %ecx +; X32-NEXT: kmovd %ecx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: andb $2, %dl +; X32-NEXT: shrb %dl +; X32-NEXT: kmovd %edx, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: andb $15, %cl +; X32-NEXT: movl %ecx, %edx +; X32-NEXT: shrb $2, %dl +; X32-NEXT: kmovd %edx, %k0 +; X32-NEXT: vpmovb2m %zmm0, %k1 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vpbroadcastw %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vpmovm2b %k1, %zmm1 +; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: shrb $3, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm1 +; X32-NEXT: vpslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; X32-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm5 +; X32-NEXT: vpbroadcastd %xmm5, %xmm5 +; X32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $29, %ecx +; X32-NEXT: andb $1, %cl +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm4 +; X32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; X32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; X32-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vpmovm2b %k0, %zmm3 +; X32-NEXT: vpbroadcastw %xmm3, %xmm3 +; X32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k0 +; X32-NEXT: kshiftlq $1, %k0, %k0 +; X32-NEXT: kshiftrq $1, %k0, %k0 +; X32-NEXT: shrl $31, %eax +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: korq %k1, %k0, %k1 +; X32-NEXT: vmovdqa64 {{[0-9]+}}(%esp), %zmm0 # 64-byte Reload +; X32-NEXT: vmovdqa64 {{[0-9]+}}(%esp), %zmm1 # 64-byte Reload +; X32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1} +; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovq %rdi, %k1 +; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovq %k0, %rax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <64 x i8> + %1 = icmp eq <64 x i8> %0, zeroinitializer + %2 = bitcast i64 %__U to <64 x i1> + %3 = and <64 x i1> %1, %2 + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmw %zmm0, %zmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmw %zmm0, %zmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <32 x i16> + %1 = icmp eq <32 x i16> %0, zeroinitializer + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) { +; X32-LABEL: test_mm512_mask_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: 
kmovd {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and1.i.i = and <8 x i64> %__B, %__A + %0 = bitcast <8 x i64> %and1.i.i to <32 x i16> + %1 = icmp eq <32 x i16> %0, zeroinitializer + %2 = bitcast i32 %__U to <32 x i1> + %3 = and <32 x i1> %1, %2 + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + !0 = !{i32 1} Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2,7 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 - declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { @@ -3795,3 +3794,135 @@ ret <64 x i8> %res2 } +declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64) + +define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k1 {%k1} +; AVX512BW-NEXT: kmovq %k1, %rcx +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $20, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovq %k1, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) + %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) + %res2 = add i64 %res, %res1 + ret i64 %res2 +} + +declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32) + +define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k1 {%k1} +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vptestmw %zmm1, 
%zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovd %k1, %ecx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2) + +define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k1 {%k1} +; AVX512BW-NEXT: kmovq %k1, %rcx +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $20, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovq %k1, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) + %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) + %res2 = add i64 %res, %res1 + ret i64 %res2 +} + +declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2) + +define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k1 {%k1} +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovd %k1, %ecx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1683,134 +1683,6 @@ ret <32 x i16> %res4 } -declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64) - -define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { -; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: 
vptestmb %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovq %k0, %rcx -; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: subl $20, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 -; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $20, %esp -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) - %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) - %res2 = add i64 %res, %res1 - ret i64 %res2 -} - -declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32) - -define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { -; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovd %k0, %ecx -; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovd %k0, %ecx -; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %ecx, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1) - %res2 = add i32 %res, %res1 - ret i32 %res2 -} - -declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2) - -define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { -; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovq %k0, %rcx -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: subl $20, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 -; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $20, %esp -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) - 
%res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) - %res2 = add i64 %res, %res1 - ret i64 %res2 -} - -declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2) - -define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { -; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovd %k0, %ecx -; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512: -; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovd %k0, %ecx -; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %ecx, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1) - %res2 = add i32 %res, %res1 - ret i32 %res2 -} - define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) { ; AVX512BW-LABEL: test_x86_avx512_psll_w_512: ; AVX512BW: ## BB#0: Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -4,6 +4,400 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c +define zeroext i16 @test_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmb %xmm0, %xmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmb %xmm0, %xmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <16 x i8> + %1 = icmp ne <16 x i8> %0, zeroinitializer + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +define zeroext i16 @test_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <16 x i8> + %1 = icmp ne <16 x i8> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define i32 @test_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmb %ymm0, %ymm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: 
vptestmb %ymm0, %ymm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <32 x i8> + %1 = icmp ne <32 x i8> %0, zeroinitializer + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +define i32 @test_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_test_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_test_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <32 x i8> + %1 = icmp ne <32 x i8> %0, zeroinitializer + %2 = bitcast i32 %__U to <32 x i1> + %3 = and <32 x i1> %1, %2 + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i8 @test_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmw %xmm0, %xmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmw %xmm0, %xmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <8 x i16> + %1 = icmp ne <8 x i16> %0, zeroinitializer + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <8 x i16> + %1 = icmp ne <8 x i16> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %3 = and <8 x i1> %1, %2 + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i16 @test_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmw %ymm0, %ymm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmw %ymm0, %ymm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <16 x i16> + %1 = icmp ne <16 x i16> %0, zeroinitializer + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +define zeroext i16 @test_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_test_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl 
%ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_test_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <16 x i16> + %1 = icmp ne <16 x i16> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmb %xmm0, %xmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmb %xmm0, %xmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <16 x i8> + %1 = icmp eq <16 x i8> %0, zeroinitializer + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +define zeroext i16 @test_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <16 x i8> + %1 = icmp eq <16 x i8> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define i32 @test_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmb %ymm0, %ymm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmb %ymm0, %ymm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <32 x i8> + %1 = icmp eq <32 x i8> %0, zeroinitializer + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +define i32 @test_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_testn_epi8_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_testn_epi8_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <32 x i8> + %1 = icmp eq <32 x i8> %0, zeroinitializer + %2 = bitcast i32 %__U to <32 x i1> + %3 = and <32 x i1> %1, %2 + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i8 @test_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: 
test_mm_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmw %xmm0, %xmm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmw %xmm0, %xmm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <8 x i16> + %1 = icmp eq <8 x i16> %0, zeroinitializer + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <8 x i16> + %1 = icmp eq <8 x i16> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %3 = and <8 x i1> %1, %2 + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i16 @test_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmw %ymm0, %ymm1, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmw %ymm0, %ymm1, %k0 +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <16 x i16> + %1 = icmp eq <16 x i16> %0, zeroinitializer + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +define zeroext i16 @test_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_testn_epi16_mask: +; X32: # BB#0: # %entry +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_testn_epi16_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <16 x i16> + %1 = icmp eq <16 x i16> %0, zeroinitializer + %2 = bitcast i16 %__U to <16 x i1> + %3 = and <16 x i1> %1, %2 + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + define <2 x i64> @test_mm_mask_set1_epi8(<2 x i64> %__O, i16 zeroext %__M, i8 signext %__A) local_unnamed_addr #0 { ; X32-LABEL: test_mm_mask_set1_epi8: ; X32: # BB#0: # %entry Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -3672,3 +3672,157 @@ declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +declare i16 
@llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16) + +define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32) + +define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8) + +define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16) + +define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; 
CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16) + +define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32) + +define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmw %ymm1, %ymm0, 
%k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc9] +; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -2515,154 +2515,3 @@ ret <16 x i16> %res4 } -declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16) - -define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32) - -define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) - %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1) - %res2 = add i32 %res, %res1 - ret i32 %res2 -} - -declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8) - -define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: 
addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16) - -define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16) - -define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32) - -define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) - %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1) - %res2 = add i32 %res, %res1 - ret i32 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## 
encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2) - -define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - - Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -233,6 +233,424 @@ ret <4 x i64> %1 } +define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_test_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmd %xmm0, %xmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_test_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <4 x i32> + %1 = icmp ne <4 x i32> %0, zeroinitializer + %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} + +define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_test_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_test_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <4 x i32> + %1 = icmp ne <4 x i32> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %3 = and <4 x i1> %1, %extract.i + %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> + %5 = 
bitcast <8 x i1> %4 to i8 + ret i8 %5 +} + +define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_test_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmd %ymm0, %ymm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_test_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <8 x i32> + %1 = icmp ne <8 x i32> %0, zeroinitializer + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_test_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_test_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <8 x i32> + %1 = icmp ne <8 x i32> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %3 = and <8 x i1> %1, %2 + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_test_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmq %xmm0, %xmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_test_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer + %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_test_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_test_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = and <2 x i1> %0, %extract.i + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_test_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestmq %ymm0, %ymm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm256_test_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_test_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_test_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = and <4 x i1> %0, %extract.i + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmd %xmm0, %xmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <4 x i32> + %1 = icmp eq <4 x i32> %0, zeroinitializer + %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} + +define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = bitcast <2 x i64> %and.i.i to <4 x i32> + %1 = icmp eq <4 x i32> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %3 = and <4 x i1> %1, %extract.i + %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> + %5 = bitcast <8 x i1> %4 to i8 + ret i8 %5 +} + +define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl 
%al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <8 x i32> + %1 = icmp eq <8 x i32> %0, zeroinitializer + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_testn_epi32_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_testn_epi32_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = bitcast <4 x i64> %and.i.i to <8 x i32> + %1 = icmp eq <8 x i32> %0, zeroinitializer + %2 = bitcast i8 %__U to <8 x i1> + %3 = and <8 x i1> %1, %2 + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmq %xmm0, %xmm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer + %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { +; X32-LABEL: test_mm_mask_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %and.i.i = and <2 x i64> %__B, %__A + %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = and <2 x i1> %0, %extract.i + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) { +; X32-LABEL: test_mm256_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: vptestnmq %ymm0, %ymm1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, 
<4 x i64> %__B) { +; X32-LABEL: test_mm256_mask_testn_epi64_mask: +; X32: # BB#0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_testn_epi64_mask: +; X64: # BB#0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %and.i.i = and <4 x i64> %__B, %__A + %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = and <4 x i1> %0, %extract.i + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) { ; X32-LABEL: test_mm_mask_set1_epi32: ; X32: # BB#0: # %entry Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -5992,3 +5992,155 @@ ret <8 x i32> %res2 } +declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8) + +define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8) + +define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8) + +define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: 
[0x62,0xf2,0xfd,0x08,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8) + +define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare 
i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc9] +; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3859,160 +3859,6 @@ ret <8 x float> %res4 } -declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8) - -define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8) - -define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx 
## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8) - -define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8) - -define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, 
i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { -; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] -; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - - - define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_x86_avx512_psra_q_128: ; CHECK: ## BB#0: