Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2457,6 +2457,18 @@
                               NewInf.KRC)>;
 
   def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask,
+                                          (X86cmpm (_.VT _.RC:$src1),
+                                                   (_.VT _.RC:$src2),
+                                                   imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
+                                                  _.RC:$src1,
+                                                  _.RC:$src2,
+                                                  imm:$cc),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
                               (_.KVT (X86cmpm (_.VT _.RC:$src1),
                                               (_.VT (bitconvert (_.LdFrag addr:$src2))),
                                               imm:$cc)),
@@ -2467,6 +2479,19 @@
                               NewInf.KRC)>;
 
   def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask,
+                                          (X86cmpm (_.VT _.RC:$src1),
+                                                   (_.VT (bitconvert
+                                                          (_.LdFrag addr:$src2))),
+                                                   imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
+                                                  _.RC:$src1,
+                                                  addr:$src2,
+                                                  imm:$cc),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
                               (_.KVT (X86cmpm (_.VT _.RC:$src1),
                                               (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
                                               imm:$cc)),
@@ -2475,6 +2500,19 @@
                               addr:$src2,
                               imm:$cc),
                               NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask,
+                                          (X86cmpm (_.VT _.RC:$src1),
+                                                   (X86VBroadcast
+                                                    (_.ScalarLdFrag addr:$src2)),
+                                                   imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbik) _.KRCWM:$mask,
+                                                  _.RC:$src1,
+                                                  addr:$src2,
+                                                  imm:$cc),
+                              NewInf.KRC)>;
 }
 }
@@ -2493,6 +2531,19 @@
                               _.RC:$src2,
                               imm:$cc),
                               NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask,
+                                          (X86cmpmRnd (_.VT _.RC:$src1),
+                                                      (_.VT _.RC:$src2),
+                                                      imm:$cc,
+                                                      (i32 FROUND_NO_EXC)))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rribk) _.KRCWM:$mask,
+                                                  _.RC:$src1,
+                                                  _.RC:$src2,
+                                                  imm:$cc),
+                              NewInf.KRC)>;
 }
Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -1,7 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX
+; NoVLX checks were generated only for a few relevant test functions by the
+; utils/update_llc_test_checks.py tool; please update them accordingly.
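For reference (not part of the change itself): every masked test function added below has the same IR shape, a vector fcmp, an and with the bitcast of the scalar mask argument, and a zero-extending shufflevector into a wider i1 vector. That is roughly what Clang emits for the AVX-512VL masked-compare intrinsics, and it is the (and $mask, (X86cmpm ...)) form that the new TableGen patterns above fold into a single masked compare. A minimal C sketch of that source-level pattern, assuming the standard immintrin.h intrinsics (the helper names are illustrative only, not taken from the patch):

#include <immintrin.h>
#include <stdint.h>

/* Roughly the source shape of test_masked_vcmpoeqps_v4i1_v8i1_mask:
 * compare under a caller-supplied mask and return the 4 result bits
 * zero-extended.  Built with clang -O2 -mavx512vl, the fcmp+and+shuffle
 * IR should now select to "vcmpeqps %xmm1, %xmm0, %k0 {%k1}" instead of
 * a compare followed by a separate mask AND. */
static inline uint8_t masked_cmp_oeq_ps128(uint8_t mask, __m128 a, __m128 b) {
  /* _mm_mask_cmp_ps_mask: compare only the lanes enabled in 'mask',
   * zeroing the disabled bits of the result mask. */
  return (uint8_t)_mm_mask_cmp_ps_mask((__mmask8)mask, a, b, _CMP_EQ_OQ);
}

/* Broadcast-from-memory variant (the "{1to4}" tests): the splat load is
 * expected to fold into the compare's memory operand via the new rmbik
 * patterns. */
static inline uint8_t masked_cmp_oeq_ps128_bcast(uint8_t mask, __m128 a,
                                                 const float *b) {
  return (uint8_t)_mm_mask_cmp_ps_mask((__mmask8)mask, a,
                                       _mm_set1_ps(*b), _CMP_EQ_OQ);
}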
+ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: ; CHECK: ## BB#0: ## %entry @@ -12349,6 +12351,67 @@ ret i8 %4 } +define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: @@ -12398,6 +12461,67 @@ ret i16 %4 } +define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x 
i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: @@ -12447,6 +12571,67 @@ ret i32 %4 } +define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to4}, 
%xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: @@ -12496,6 +12681,67 @@ ret i64 %4 } +define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: @@ -12578,6 +12824,67 @@ ret i16 %4 } +define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax 
+; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: @@ -12630,28 +12937,31 @@ ret i32 %4 } - -define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %1 = bitcast <4 x i64> %__b to <8 x float> %2 = fcmp oeq <8 x float> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 } -define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps 
(%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -12659,16 +12969,19 @@ %load = load <4 x i64>, <4 x i64>* %__b %1 = bitcast <4 x i64> %load to <8 x float> %2 = fcmp oeq <8 x float> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 } -define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -12677,12 +12990,128 @@ %vec = insertelement <8 x float> undef, float %load, i32 0 %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %2 = fcmp oeq <8 x float> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 } + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x 
i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + + define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: ; CHECK: ## BB#0: ## %entry @@ -12734,6 +13163,67 @@ ret i32 %4 } +define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %zmm0, 
%k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: @@ -12750,6 +13240,23 @@ ret i32 %3 } +define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8) + %3 = zext i16 %2 to i32 + ret i32 %3 +} + + define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: @@ -12802,6 +13309,67 @@ ret i64 %4 } +define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 
= fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: @@ -12819,6 +13387,24 @@ ret i64 %3 } +define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8) + %3 = zext i16 %2 to i64 + ret i64 %3 +} + + declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { @@ -12872,6 +13458,70 @@ ret i4 %4 } +define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp 
oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: @@ -12921,6 +13571,67 @@ ret i8 %4 } +define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> 
%4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: @@ -12970,6 +13681,67 @@ ret i16 %4 } +define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: @@ -13019,6 +13791,67 @@ ret i32 %4 } +define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + 
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: @@ -13068,6 +13901,67 @@ ret i64 %4 } +define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; 
CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = bitcast i2 %__u to <2 x i1> + %4 = and <2 x i1> %2, %3 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: @@ -13120,6 +14014,70 @@ ret i8 %4 } +define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: @@ -13172,6 +14130,70 @@ ret i16 %4 } +define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) 
local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: @@ -13192,7 +14214,64 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load 
double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -13201,15 +14280,19 @@ %load = load <4 x i64>, <4 x i64>* %__b %1 = bitcast <4 x i64> %load to <4 x double> %2 = fcmp oeq <4 x double> %0, %1 - %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> - %4 = bitcast <32 x i1> %3 to i32 - ret i32 %4 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 } -define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -13219,12 +14302,15 @@ %vec = insertelement <4 x double> undef, double %load, i32 0 %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %2 = fcmp oeq <4 x double> %0, %1 - %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> - %4 = bitcast <32 x i1> %3 to i32 - ret i32 %4 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 } + define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: ; CHECK: ## BB#0: ## %entry @@ -13276,6 +14362,70 @@ ret i64 %4 } +define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = bitcast i4 %__u to <4 x i1> + %4 = and <4 x i1> %2, %3 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: @@ -13328,6 +14478,67 @@ ret i16 %4 } +define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %1 = bitcast <8 x i64> %__b to <8 x double> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %load = load <8 x i64>, <8 x i64>* %__b 
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
 define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
@@ -13344,6 +14555,23 @@
   ret i16 %3
 }
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i16
+  ret i16 %3
+}
+
+
 define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
@@ -13396,6 +14624,67 @@
   ret i32 %4
 }
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
 define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
@@ -13412,6 +14701,23 @@
   ret i32 %3
 }
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i32
+  ret i32 %3
+}
+
+
 define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
@@ -13464,6 +14770,67 @@
   ret i64 %4
 }
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
 define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
@@ -13481,5 +14848,22 @@
   ret i64 %3
 }
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i64
+  ret i64 %3
+}
+
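For reviewers, a minimal C sketch of the kind of source these masked test cases correspond to: an AVX-512 compare under an incoming mask whose k-register result is zero-extended to a wider integer. The function names below are illustrative only (not part of this patch); the intrinsics and predicate macros are the standard ones from immintrin.h, and the point is that the `and` of the compare with the mask should now select the {%k1}-masked VCMPPD forms checked above. Compile with -mavx512f.

    #include <immintrin.h>
    #include <stdint.h>

    /* Register-register form: expected to lower to
       vcmpeqpd %zmm1, %zmm0, %k0 {%k1} plus a kmov of the result. */
    uint32_t masked_cmp_oeq_pd(__mmask8 u, __m512d a, __m512d b) {
      __mmask8 m = _mm512_mask_cmp_pd_mask(u, a, b, _CMP_EQ_OQ);
      return (uint32_t)m;  /* corresponds to the <8 x i1> -> <32 x i1> zero-extension in the IR */
    }

    /* Broadcast-from-memory operand: the (%rsi){1to8} embedded-broadcast form. */
    uint32_t masked_cmp_oeq_pd_bcast(__mmask8 u, __m512d a, const double *b) {
      __mmask8 m = _mm512_mask_cmp_pd_mask(u, a, _mm512_set1_pd(*b), _CMP_EQ_OQ);
      return (uint32_t)m;
    }

    /* SAE variant: predicate i32 2 (_CMP_LE_OS) and rounding i32 8 (FROUND_NO_EXC)
       in the llvm.x86.avx512.mask.cmp.pd.512 call, hence the vcmplepd {sae} checks. */
    uint32_t masked_cmp_sae_pd(__mmask8 u, __m512d a, __m512d b) {
      __mmask8 m = _mm512_mask_cmp_round_pd_mask(u, a, b, _CMP_LE_OS, _MM_FROUND_NO_EXC);
      return (uint32_t)m;
    }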