Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW @@ -143,46 +143,29 @@ } define <4 x float> @slto4f32_mem(<4 x i64>* %a) { -; KNL-LABEL: slto4f32_mem: -; KNL: # BB#0: -; KNL-NEXT: vmovdqu (%rdi), %ymm0 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: slto4f32_mem: +; NODQ: # BB#0: +; NODQ-NEXT: vmovdqu (%rdi), %ymm0 +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32_mem: ; VLDQ: # BB#0: ; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: slto4f32_mem: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0 -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VLNODQ-NEXT: vzeroupper -; VLNODQ-NEXT: retq -; ; AVX512DQ-LABEL: slto4f32_mem: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 @@ -190,24 +173,6 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: slto4f32_mem: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b @@ -282,21 +247,22 @@ } define <4 x float> @slto4f32(<4 x i64> %a) { -; KNL-LABEL: slto4f32: -; KNL: # BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: slto4f32: +; NODQ: # BB#0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32: ; VLDQ: # BB#0: @@ -304,23 +270,6 @@ ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: slto4f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VLNODQ-NEXT: vzeroupper -; VLNODQ-NEXT: retq -; ; AVX512DQ-LABEL: slto4f32: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 @@ -328,43 +277,27 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: slto4f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ulto4f32(<4 x i64> %a) { -; KNL-LABEL: ulto4f32: -; KNL: # BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: ulto4f32: +; NODQ: # BB#0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto4f32: ; VLDQ: # BB#0: @@ -372,23 +305,6 @@ ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: ulto4f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VLNODQ-NEXT: vzeroupper -; VLNODQ-NEXT: retq -; ; AVX512DQ-LABEL: ulto4f32: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 @@ -396,23 +312,6 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: ulto4f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } @@ -457,179 +356,67 @@ } define <16 x double> @ulto16f64(<16 x i64> %a) { -; KNL-LABEL: ulto16f64: -; KNL: # BB#0: -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; KNL-NEXT: vpextrq $1, %xmm4, %rax -; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovq %xmm4, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 -; KNL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; KNL-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 -; KNL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; KNL-NEXT: retq +; NODQ-LABEL: ulto16f64: +; NODQ: # BB#0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; NODQ-NEXT: retq ; ; DQ-LABEL: ulto16f64: ; DQ: # BB#0: ; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 ; DQ-NEXT: retq -; -; VLNODQ-LABEL: ulto16f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm1, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; VLNODQ-NEXT: retq -; -; AVX512BW-LABEL: ulto16f64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: retq %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } @@ -653,18 +440,12 @@ } define <16 x i8> @f32to16uc(<16 x float> %f) { -; KNL-LABEL: f32to16uc: -; KNL: # BB#0: -; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: retq -; -; AVX512-LABEL: f32to16uc: -; AVX512: # BB#0: -; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; ALL-LABEL: f32to16uc: +; ALL: # BB#0: +; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %res = fptoui <16 x float> %f to <16 x i8> ret <16 x i8> %res } @@ -696,33 +477,18 @@ } define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { -; KNL-LABEL: f32to4ui: -; KNL: # BB#0: -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; KNL-NEXT: retq +; NOVL-LABEL: f32to4ui: +; NOVL: # BB#0: +; NOVL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: f32to4ui: ; VL: # BB#0: ; VL-NEXT: vcvttps2udq %xmm0, %xmm0 ; VL-NEXT: retq -; -; AVX512DQ-LABEL: f32to4ui: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: f32to4ui: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } @@ -737,12 +503,13 @@ } define <8 x i16> @f64to8us(<8 x double> %f) { -; KNL-LABEL: f64to8us: -; KNL: # BB#0: -; KNL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL-NEXT: retq +; NOVL-LABEL: f64to8us: +; NOVL: # BB#0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: f64to8us: ; VL: # BB#0: @@ -750,33 +517,18 @@ ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; AVX512DQ-LABEL: f64to8us: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: f64to8us: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %res = fptoui <8 x double> %f to <8 x i16> ret <8 x i16> %res } define <8 x i8> @f64to8uc(<8 x double> %f) { -; KNL-LABEL: f64to8uc: -; KNL: # BB#0: -; KNL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL-NEXT: retq +; NOVL-LABEL: f64to8uc: +; NOVL: # BB#0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: f64to8uc: ; VL: # BB#0: @@ -784,55 +536,24 @@ ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; AVX512DQ-LABEL: f64to8uc: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: f64to8uc: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { -; KNL-LABEL: f64to4ui: -; KNL: # BB#0: -; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL-NEXT: retq +; NOVL-LABEL: f64to4ui: +; NOVL: # BB#0: +; NOVL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: f64to4ui: ; VL: # BB#0: ; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; AVX512DQ-LABEL: f64to4ui: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: f64to4ui: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -926,16 +647,11 @@ } define <4 x i32> @f64to4si(<4 x double> %a) { -; KNL-LABEL: f64to4si: -; KNL: # BB#0: -; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0 -; KNL-NEXT: retq -; -; AVX512-LABEL: f64to4si: -; AVX512: # BB#0: -; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; ALL-LABEL: f64to4si: +; ALL: # BB#0: +; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -952,28 +668,24 @@ } define <4 x float> @f64to4f32(<4 x double> %b) { -; KNL-LABEL: f64to4f32: -; KNL: # BB#0: -; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; KNL-NEXT: retq -; -; AVX512-LABEL: f64to4f32: -; AVX512: # BB#0: -; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; ALL-LABEL: f64to4f32: +; ALL: # BB#0: +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { -; KNL-LABEL: f64to4f32_mask: -; KNL: # BB#0: -; KNL-NEXT: vpslld $31, %xmm1, %xmm1 -; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 -; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: f64to4f32_mask: +; NOVL: # BB#0: +; NOVL-NEXT: vpslld $31, %xmm1, %xmm1 +; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1 +; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: f64to4f32_mask: ; VL: # BB#0: @@ -982,24 +694,6 @@ ; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; AVX512DQ-LABEL: f64to4f32_mask: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: f64to4f32_mask: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c @@ -1180,100 +874,40 @@ } define <8 x float> @slto8f32(<8 x i64> %a) { -; KNL-LABEL: slto8f32: -; KNL: # BB#0: -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3] -; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq +; NODQ-LABEL: slto8f32: +; NODQ: # BB#0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq ; ; DQ-LABEL: slto8f32: ; DQ: # BB#0: ; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQ-NEXT: retq -; -; VLNODQ-LABEL: slto8f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vmovq %xmm1, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VLNODQ-NEXT: retq -; -; AVX512BW-LABEL: slto8f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: retq %b = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } @@ -1386,278 +1020,106 @@ } define <16 x double> @slto16f64(<16 x i64> %a) { -; KNL-LABEL: slto16f64: -; KNL: # BB#0: -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; KNL-NEXT: vpextrq $1, %xmm4, %rax -; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovq %xmm4, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 -; KNL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; KNL-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm3, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 -; KNL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; KNL-NEXT: retq +; NODQ-LABEL: slto16f64: +; NODQ: # BB#0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; NODQ-NEXT: retq ; ; DQ-LABEL: slto16f64: ; DQ: # BB#0: ; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQ-NEXT: vcvtqq2pd %zmm1, %zmm1 ; DQ-NEXT: retq -; -; VLNODQ-LABEL: slto16f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm3, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; VLNODQ-NEXT: vmovq %xmm1, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; VLNODQ-NEXT: retq -; -; AVX512BW-LABEL: slto16f64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm3, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: retq %b = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <8 x float> @ulto8f32(<8 x i64> %a) { -; KNL-LABEL: ulto8f32: -; KNL: # BB#0: -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3] -; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq +; NODQ-LABEL: ulto8f32: +; NODQ: # BB#0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq ; ; DQ-LABEL: ulto8f32: ; DQ: # BB#0: ; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; DQ-NEXT: retq -; -; VLNODQ-LABEL: ulto8f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vmovq %xmm1, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; VLNODQ-NEXT: vmovq %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VLNODQ-NEXT: retq -; -; AVX512BW-LABEL: ulto8f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: retq %b = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } @@ -1852,33 +1314,18 @@ } define <4 x float> @uito4f32(<4 x i32> %a) nounwind { -; KNL-LABEL: uito4f32: -; KNL: # BB#0: -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; KNL-NEXT: retq +; NOVL-LABEL: uito4f32: +; NOVL: # BB#0: +; NOVL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq ; ; VL-LABEL: uito4f32: ; VL: # BB#0: ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq -; -; AVX512DQ-LABEL: uito4f32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: uito4f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } @@ -2510,19 +1957,19 @@ } define <2 x float> @ubto2f32(<2 x i32> %a) { -; KNL-LABEL: ubto2f32: -; KNL: # BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpextrb $8, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: vpextrb $0, %xmm0, %ecx -; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm1 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; KNL-NEXT: retq +; NOVL-LABEL: ubto2f32: +; NOVL: # BB#0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpextrb $8, %xmm0, %eax +; NOVL-NEXT: andl $1, %eax +; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 +; NOVL-NEXT: vpextrb $0, %xmm0, %eax +; NOVL-NEXT: andl $1, %eax +; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; NOVL-NEXT: retq ; ; VL-LABEL: ubto2f32: ; VL: # BB#0: @@ -2532,34 +1979,6 @@ ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq -; -; AVX512DQ-LABEL: ubto2f32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vpextrb $8, %xmm0, %eax -; AVX512DQ-NEXT: andl $1, %eax -; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: andl $1, %eax -; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: ubto2f32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512BW-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=CHECK --check-prefix=KNL %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { ; CHECK-LABEL: test1: @@ -12,7 +12,6 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %rrr = load float, float* %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14 @@ -28,7 +27,6 @@ ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %rrr = load double, double* %br %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 @@ -42,7 +40,6 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %eee = extractelement <16 x float> %x, i32 4 %rrr2 = insertelement <16 x float> %x, float %eee, i32 1 ret <16 x float> %rrr2 @@ -56,7 +53,6 @@ ; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %eee = extractelement <8 x i64> %x, i32 4 %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1 ret <8 x i64> %rrr2 @@ -67,7 +63,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vextractps $3, %xmm0, %eax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %ef = extractelement <4 x float> %x, i32 3 %ei = bitcast float %ef to i32 ret i32 %ei @@ -78,148 +73,83 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vextractps $3, %xmm0, (%rdi) ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %ef = extractelement <4 x float> %x, i32 3 store float %ef, float* %out, align 4 ret void } define float @test7(<16 x float> %x, i32 %ind) nounwind { -; KNL-LABEL: test7: -; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI -; KNL-NEXT: vmovaps %zmm0, (%rsp) -; KNL-NEXT: andl $15, %edi -; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test7: -; SKX: ## BB#0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: ## kill: %EDI %EDI %RDI -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: andl $15, %edi -; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test7: +; CHECK: ## BB#0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %e = extractelement <16 x float> %x, i32 %ind ret float %e } define double @test8(<8 x double> %x, i32 %ind) nounwind { -; KNL-LABEL: test8: -; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI -; KNL-NEXT: vmovaps %zmm0, (%rsp) -; KNL-NEXT: andl $7, %edi -; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test8: -; SKX: ## BB#0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: ## kill: %EDI %EDI %RDI -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: andl $7, %edi -; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test8: +; CHECK: ## BB#0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: andl $7, %edi +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %e = extractelement <8 x double> %x, i32 %ind ret double %e } define float @test9(<8 x float> %x, i32 %ind) nounwind { -; KNL-LABEL: test9: -; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: andq $-32, %rsp -; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI -; KNL-NEXT: vmovaps %ymm0, (%rsp) -; KNL-NEXT: andl $7, %edi -; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test9: -; SKX: ## BB#0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: andq $-32, %rsp -; SKX-NEXT: subq $64, %rsp -; SKX-NEXT: ## kill: %EDI %EDI %RDI -; SKX-NEXT: vmovaps %ymm0, (%rsp) -; SKX-NEXT: andl $7, %edi -; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test9: +; CHECK: ## BB#0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: andq $-32, %rsp +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: andl $7, %edi +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %e = extractelement <8 x float> %x, i32 %ind ret float %e } define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { -; KNL-LABEL: test10: -; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI -; KNL-NEXT: vmovaps %zmm0, (%rsp) -; KNL-NEXT: andl $15, %edi -; KNL-NEXT: movl (%rsp,%rdi,4), %eax -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test10: -; SKX: ## BB#0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: ## kill: %EDI %EDI %RDI -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: andl $15, %edi -; SKX-NEXT: movl (%rsp,%rdi,4), %eax -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test10: +; CHECK: ## BB#0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: movl (%rsp,%rdi,4), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %e = extractelement <16 x i32> %x, i32 %ind ret i32 %e } @@ -274,6 +204,7 @@ ; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test12: @@ -339,6 +270,7 @@ ; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test14: @@ -387,6 +319,7 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test16: @@ -424,6 +357,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test17: @@ -448,20 +382,13 @@ } define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) { -; KNL-LABEL: extract_v8i64: -; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v8i64: -; SKX: ## BB#0: -; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v8i64: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <8 x i64> %x, i32 1 %r2 = extractelement <8 x i64> %x, i32 3 store i64 %r2, i64* %dst, align 1 @@ -469,20 +396,13 @@ } define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { -; KNL-LABEL: extract_v4i64: -; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v4i64: -; SKX: ## BB#0: -; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v4i64: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <4 x i64> %x, i32 1 %r2 = extractelement <4 x i64> %x, i32 3 store i64 %r2, i64* %dst, align 1 @@ -502,20 +422,13 @@ } define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) { -; KNL-LABEL: extract_v16i32: -; KNL: ## BB#0: -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrd $1, %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v16i32: -; SKX: ## BB#0: -; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v16i32: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrd $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <16 x i32> %x, i32 1 %r2 = extractelement <16 x i32> %x, i32 5 store i32 %r2, i32* %dst, align 1 @@ -523,20 +436,13 @@ } define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { -; KNL-LABEL: extract_v8i32: -; KNL: ## BB#0: -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrd $1, %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v8i32: -; SKX: ## BB#0: -; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v8i32: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrd $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 %r2 = extractelement <8 x i32> %x, i32 5 store i32 %r2, i32* %dst, align 1 @@ -556,22 +462,14 @@ } define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) { -; KNL-LABEL: extract_v32i16: -; KNL: ## BB#0: -; KNL-NEXT: vpextrw $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrw $1, %xmm0, (%rdi) -; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v32i16: -; SKX: ## BB#0: -; SKX-NEXT: vpextrw $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v32i16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi) +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <32 x i16> %x, i32 1 %r2 = extractelement <32 x i16> %x, i32 9 store i16 %r2, i16* %dst, align 1 @@ -579,22 +477,14 @@ } define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { -; KNL-LABEL: extract_v16i16: -; KNL: ## BB#0: -; KNL-NEXT: vpextrw $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrw $1, %xmm0, (%rdi) -; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v16i16: -; SKX: ## BB#0: -; SKX-NEXT: vpextrw $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v16i16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi) +; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <16 x i16> %x, i32 1 %r2 = extractelement <16 x i16> %x, i32 9 store i16 %r2, i16* %dst, align 1 @@ -615,22 +505,14 @@ } define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) { -; KNL-LABEL: extract_v64i8: -; KNL: ## BB#0: -; KNL-NEXT: vpextrb $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrb $1, %xmm0, (%rdi) -; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v64i8: -; SKX: ## BB#0: -; SKX-NEXT: vpextrb $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v64i8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrb $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi) +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <64 x i8> %x, i32 1 %r2 = extractelement <64 x i8> %x, i32 17 store i8 %r2, i8* %dst, align 1 @@ -638,22 +520,14 @@ } define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) { -; KNL-LABEL: extract_v32i8: -; KNL: ## BB#0: -; KNL-NEXT: vpextrb $1, %xmm0, %eax -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpextrb $1, %xmm0, (%rdi) -; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq -; -; SKX-LABEL: extract_v32i8: -; SKX: ## BB#0: -; SKX-NEXT: vpextrb $1, %xmm0, %eax -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: extract_v32i8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpextrb $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi) +; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %r1 = extractelement <32 x i8> %x, i32 1 %r2 = extractelement <32 x i8> %x, i32 17 store i8 %r2, i8* %dst, align 1 @@ -936,6 +810,7 @@ ; KNL-NEXT: subq $32, %rsp ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -1061,7 +936,6 @@ ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; KNL-NEXT: setb %al ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 @@ -1083,6 +957,7 @@ ; KNL-NEXT: movl (%rsp), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_insertelement_v32i1: @@ -1112,23 +987,23 @@ ; KNL-LABEL: test_iinsertelement_v4i1: ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al ; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpextrb $4, %xmm0, %eax -; KNL-NEXT: setb %cl -; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpextrb $4, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: vpextrb $0, %xmm0, %eax -; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 @@ -1144,6 +1019,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_iinsertelement_v4i1: @@ -1188,6 +1064,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_iinsertelement_v2i1: @@ -1308,6 +1185,7 @@ ; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpextrb $2, %xmm0, %eax ; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v32i1: @@ -1338,6 +1216,7 @@ ; KNL-NEXT: movb $4, %cl ; KNL-NEXT: subb %al, %cl ; KNL-NEXT: movzbl %cl, %eax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v64i1: @@ -1370,6 +1249,7 @@ ; KNL-NEXT: movb $4, %cl ; KNL-NEXT: subb %al, %cl ; KNL-NEXT: movzbl %cl, %eax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: extractelement_v64i1_alt: @@ -1421,6 +1301,7 @@ ; KNL-NEXT: movq (%rsp,%rdi,8), %rax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v4i64: @@ -1466,6 +1347,7 @@ ; KNL-NEXT: movq (%rsp,%rdi,8), %rax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8i64: @@ -1523,6 +1405,7 @@ ; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v4f64: @@ -1568,6 +1451,7 @@ ; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8f64: @@ -1625,6 +1509,7 @@ ; KNL-NEXT: movl (%rsp,%rdi,4), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8i32: @@ -1670,6 +1555,7 @@ ; KNL-NEXT: movl (%rsp,%rdi,4), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v16i32: @@ -1727,6 +1613,7 @@ ; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8f32: @@ -1772,6 +1659,7 @@ ; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v16f32: @@ -1829,6 +1717,7 @@ ; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v16i16: @@ -1875,6 +1764,7 @@ ; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v32i16: @@ -1902,23 +1792,14 @@ } define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) { -; KNL-LABEL: test_extractelement_variable_v16i8: -; KNL: ## BB#0: -; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; KNL-NEXT: ## kill: %EDI %EDI %RDI -; KNL-NEXT: andl $15, %edi -; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; KNL-NEXT: movb (%rdi,%rax), %al -; KNL-NEXT: retq -; -; SKX-LABEL: test_extractelement_variable_v16i8: -; SKX: ## BB#0: -; SKX-NEXT: ## kill: %EDI %EDI %RDI -; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; SKX-NEXT: andl $15, %edi -; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SKX-NEXT: movb (%rdi,%rax), %al -; SKX-NEXT: retq +; CHECK-LABEL: test_extractelement_variable_v16i8: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movb (%rdi,%rax), %al +; CHECK-NEXT: retq %t2 = extractelement <16 x i8> %t1, i32 %index ret i8 %t2 } @@ -1936,13 +1817,14 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: andl $31, %edi ; KNL-NEXT: movq %rsp, %rax ; KNL-NEXT: movb (%rdi,%rax), %al ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v32i8: @@ -1984,14 +1866,15 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $63, %edi ; KNL-NEXT: movq %rsp, %rax ; KNL-NEXT: movb (%rdi,%rax), %al ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v64i8: @@ -2042,6 +1925,7 @@ ; KNL-NEXT: movb (%rax,%rcx), %al ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v64i8_indexi8: @@ -2075,12 +1959,12 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v2i1: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movl -24(%rsp,%rdi,8), %eax ; KNL-NEXT: andl $1, %eax @@ -2105,12 +1989,12 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v4i1: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax ; KNL-NEXT: andl $1, %eax @@ -2156,6 +2040,7 @@ ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_varible_v8i1: @@ -2209,6 +2094,7 @@ ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_varible_v16i1: @@ -2265,6 +2151,7 @@ ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_varible_v32i1: Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL -; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX +; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX ; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW ; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ @@ -287,7 +287,6 @@ ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: shuf_test1: ; SKX: ## BB#0: @@ -296,7 +295,6 @@ ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function ; ; AVX512BW-LABEL: shuf_test1: ; AVX512BW: ## BB#0: @@ -305,7 +303,6 @@ ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: ## kill: %AL %AL %EAX ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: ## -- End function ; ; AVX512DQ-LABEL: shuf_test1: ; AVX512DQ: ## BB#0: @@ -314,7 +311,6 @@ ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: ## kill: %AL %AL %EAX ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: ## -- End function %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 @@ -329,6 +325,7 @@ ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: zext_test1: @@ -375,6 +372,7 @@ ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: zext_test2: @@ -424,6 +422,7 @@ ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andb $1, %al ; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: zext_test3: @@ -516,6 +515,7 @@ ; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1 ; KNL-NEXT: vpmovqd %zmm1, %ymm1 ; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test4: @@ -611,6 +611,7 @@ ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test7: @@ -661,8 +662,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-LABEL: test8: ; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: jg LBB17_1 ; KNL-NEXT: ## BB#2: ; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1 @@ -672,12 +673,13 @@ ; KNL-NEXT: LBB17_3: ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test8: ; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: jg LBB17_1 ; SKX-NEXT: ## BB#2: ; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 @@ -743,6 +745,7 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test9: @@ -883,6 +886,7 @@ ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test15: @@ -1189,6 +1193,7 @@ ; KNL-NEXT: korw %k1, %k0, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test18: @@ -1289,7 +1294,6 @@ ; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 ; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test21: ; SKX: ## BB#0: @@ -1297,7 +1301,6 @@ ; SKX-NEXT: vpmovb2m %ymm1, %k1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function ; ; AVX512BW-LABEL: test21: ; AVX512BW: ## BB#0: @@ -1305,7 +1308,6 @@ ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: ## -- End function ; ; AVX512DQ-LABEL: test21: ; AVX512DQ: ## BB#0: @@ -1319,7 +1321,6 @@ ; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: ## -- End function %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } @@ -1332,6 +1333,7 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test22: @@ -1371,6 +1373,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test23: @@ -1450,6 +1453,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_v2i1: @@ -1494,6 +1498,7 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_v4i1: @@ -1538,6 +1543,7 @@ ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_v8i1: @@ -1580,6 +1586,7 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_v16i1: @@ -1738,9 +1745,11 @@ ; KNL-NEXT: je LBB41_2 ; KNL-NEXT: ## BB#1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; KNL-NEXT: LBB41_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: ktest_1: @@ -1835,73 +1844,9 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $32, %rsp -; KNL-NEXT: vmovups 64(%rdi), %zmm2 -; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: kshiftlw $15, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: vmovd %ecx, %xmm2 -; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $13, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $12, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $11, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $10, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $9, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $8, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $7, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $6, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $5, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $4, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $3, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $2, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $1, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftrw $15, %k2, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vmovups (%rdi), %zmm3 -; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1 +; KNL-NEXT: vmovups (%rdi), %zmm2 +; KNL-NEXT: vmovups 64(%rdi), %zmm3 +; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1 ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1965,138 +1910,202 @@ ; KNL-NEXT: kshiftrw $15, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z} +; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $15, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vmovd %ecx, %xmm2 +; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $13, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $12, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $11, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $10, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $9, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $8, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $7, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $6, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $5, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $4, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $3, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $2, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftlw $1, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; KNL-NEXT: kshiftrw $15, %k2, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z} +; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} ; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0 -; KNL-NEXT: kshiftlw $14, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax -; KNL-NEXT: kshiftlw $15, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %ecx +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: vmovd %ecx, %xmm4 ; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $13, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $12, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $11, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $10, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $9, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $8, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $7, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $6, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $5, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $4, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $3, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $2, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $1, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z} -; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k0 +; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm5 -; KNL-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 +; KNL-NEXT: vmovd %ecx, %xmm3 +; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 +; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 @@ -2119,6 +2128,7 @@ ; KNL-NEXT: LBB42_3: ## %End ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: ktest_2: @@ -2565,6 +2575,7 @@ ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: load_2i1: @@ -2602,6 +2613,7 @@ ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: load_4i1: @@ -2730,6 +2742,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_8i1: @@ -2768,6 +2781,7 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_8i1_1: @@ -2806,6 +2820,7 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_16i1: @@ -2847,6 +2862,7 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_32i1: @@ -2897,6 +2913,7 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_32i1_1: @@ -2941,6 +2958,36 @@ ; ; KNL-LABEL: store_64i1: ; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi9: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: pushq %r15 +; KNL-NEXT: Lcfi10: +; KNL-NEXT: .cfi_def_cfa_offset 24 +; KNL-NEXT: pushq %r14 +; KNL-NEXT: Lcfi11: +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: pushq %r13 +; KNL-NEXT: Lcfi12: +; KNL-NEXT: .cfi_def_cfa_offset 40 +; KNL-NEXT: pushq %r12 +; KNL-NEXT: Lcfi13: +; KNL-NEXT: .cfi_def_cfa_offset 48 +; KNL-NEXT: pushq %rbx +; KNL-NEXT: Lcfi14: +; KNL-NEXT: .cfi_def_cfa_offset 56 +; KNL-NEXT: Lcfi15: +; KNL-NEXT: .cfi_offset %rbx, -56 +; KNL-NEXT: Lcfi16: +; KNL-NEXT: .cfi_offset %r12, -48 +; KNL-NEXT: Lcfi17: +; KNL-NEXT: .cfi_offset %r13, -40 +; KNL-NEXT: Lcfi18: +; KNL-NEXT: .cfi_offset %r14, -32 +; KNL-NEXT: Lcfi19: +; KNL-NEXT: .cfi_offset %r15, -24 +; KNL-NEXT: Lcfi20: +; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -2952,66 +2999,66 @@ ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $11, %k0, %k1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $10, %k0, %k1 -; KNL-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $9, %k0, %k1 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftlw $8, %k0, %k1 -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $7, %k0, %k1 -; KNL-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $6, %k0, %k1 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $5, %k0, %k1 -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $4, %k0, %k1 -; KNL-NEXT: vpinsrb $8, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $3, %k0, %k1 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $2, %k0, %k1 -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $1, %k0, %k1 -; KNL-NEXT: vpinsrb $11, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: vmovd %r9d, %xmm3 +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm2 -; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 ; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 +; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 @@ -3020,66 +3067,66 @@ ; KNL-NEXT: kmovw %k0, 6(%rdi) ; KNL-NEXT: kshiftlw $14, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $15, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $13, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $12, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %ecx, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $11, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $10, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $9, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $8, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $5, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $4, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $3, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vmovd %r10d, %xmm2 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL-NEXT: kshiftrw $15, %k2, %k0 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm1 -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1 +; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -3088,139 +3135,146 @@ ; KNL-NEXT: kmovw %k0, 4(%rdi) ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %ecx, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vmovd %r10d, %xmm1 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm0 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0 +; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: kmovw %k1, 2(%rdi) ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %edx, %xmm1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $11, %k0, %k1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $10, %k0, %k1 -; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $9, %k0, %k1 -; KNL-NEXT: vpinsrb $3, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftlw $8, %k0, %k1 -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $7, %k0, %k1 -; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $6, %k0, %k1 -; KNL-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $5, %k0, %k1 -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $4, %k0, %k1 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $3, %k0, %k1 -; KNL-NEXT: vpinsrb $9, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $2, %k0, %k1 -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $1, %k0, %k1 -; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $12, %edx, %xmm1, %xmm1 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: vmovd %r9d, %xmm0 +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm0 ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k0, 2(%rdi) ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: popq %rbx +; KNL-NEXT: popq %r12 +; KNL-NEXT: popq %r13 +; KNL-NEXT: popq %r14 +; KNL-NEXT: popq %r15 +; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_64i1: @@ -3571,6 +3625,7 @@ ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movzbl %al, %eax ; KNL-NEXT: addl %eax, %eax +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_bitcast_v8i1_zext: @@ -3609,40 +3664,14 @@ } define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { -; KNL-LABEL: test_bitcast_v16i1_zext: -; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: addl %eax, %eax -; KNL-NEXT: retq -; -; SKX-LABEL: test_bitcast_v16i1_zext: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: addl %eax, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; -; AVX512BW-LABEL: test_bitcast_v16i1_zext: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kmovw %k0, %eax -; AVX512BW-NEXT: addl %eax, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: test_bitcast_v16i1_zext: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: addl %eax, %eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; CHECK-LABEL: test_bitcast_v16i1_zext: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: addl %eax, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask1 = bitcast <16 x i1> %v1 to i16 %val = zext i16 %mask1 to i32 Index: test/CodeGen/X86/avx512-vec-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-vec-cmp.ll +++ test/CodeGen/X86/avx512-vec-cmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind { ; CHECK-LABEL: test1: @@ -8,7 +8,6 @@ ; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1 ; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = fcmp ole <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y ret <16 x float> %max @@ -20,7 +19,6 @@ ; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1 ; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = fcmp ole <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y ret <8 x double> %max @@ -32,7 +30,6 @@ ; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <16 x i32>, <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 @@ -45,7 +42,6 @@ ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = icmp uge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y ret <16 x i32> %max @@ -57,7 +53,6 @@ ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = icmp eq <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y ret <8 x i64> %max @@ -69,7 +64,6 @@ ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = icmp ugt <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y ret <8 x i64> %max @@ -123,14 +117,12 @@ ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test9: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 ; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y ret <8 x i32> %max @@ -145,14 +137,12 @@ ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test10: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1 ; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %mask = fcmp oeq <8 x float> %x, %y %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y @@ -164,7 +154,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = icmp ugt <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y ret <8 x i32> %max @@ -178,8 +167,8 @@ ; KNL-NEXT: kunpckbw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test12: ; SKX: ## BB#0: @@ -190,7 +179,6 @@ ; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %res = icmp eq <16 x i64> %a, %b %res1 = bitcast <16 x i1> %res to i16 ret i16 %res1 @@ -269,6 +257,8 @@ ; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -325,13 +315,11 @@ ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -342,8 +330,8 @@ ; KNL-NEXT: movl (%rsp), %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test12_v32i32: ; SKX: ## BB#0: @@ -353,7 +341,6 @@ ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %res = icmp eq <32 x i32> %a, %b %res1 = bitcast <32 x i1> %res to i32 ret i32 %res1 @@ -577,75 +564,75 @@ ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -656,8 +643,8 @@ ; KNL-NEXT: orq %rcx, %rax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test12_v64i16: ; SKX: ## BB#0: @@ -667,7 +654,6 @@ ; SKX-NEXT: kmovq %k0, %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %res = icmp eq <64 x i16> %a, %b %res1 = bitcast <64 x i1> %res to i64 ret i64 %res1 @@ -721,7 +707,6 @@ ; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1 ; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask = icmp sge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y ret <16 x i32> %max @@ -733,7 +718,6 @@ ; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sgt <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 @@ -746,7 +730,6 @@ ; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sle <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 @@ -759,7 +742,6 @@ ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp ule <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 @@ -773,7 +755,6 @@ ; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1} ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp eq <16 x i32> %x1, %y1 %mask0 = icmp eq <16 x i32> %x, %y %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer @@ -788,7 +769,6 @@ ; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} ; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp sge <8 x i64> %x1, %y1 %mask0 = icmp sle <8 x i64> %x, %y %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer @@ -803,7 +783,6 @@ ; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} ; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp sgt <8 x i64> %x1, %y1 %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4 %mask0 = icmp sgt <8 x i64> %x, %y @@ -819,7 +798,6 @@ ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp sge <16 x i32> %x1, %y1 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask0 = icmp ule <16 x i32> %x, %y @@ -834,7 +812,6 @@ ; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 ; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer @@ -849,7 +826,6 @@ ; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer @@ -865,7 +841,6 @@ ; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} ; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp sge <16 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 @@ -883,7 +858,6 @@ ; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} ; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %mask1 = icmp sge <8 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 @@ -926,6 +900,7 @@ ; KNL-NEXT: kxorw %k1, %k0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test29: @@ -949,14 +924,12 @@ ; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2 ; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test30: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1 ; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %mask = fcmp oeq <4 x double> %x, %y %max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y @@ -969,14 +942,12 @@ ; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2 ; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test31: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1 ; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %y = load <2 x double>, <2 x double>* %yp, align 4 %mask = fcmp olt <2 x double> %x, %y @@ -990,14 +961,12 @@ ; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2 ; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test32: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1 ; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %y = load <4 x double>, <4 x double>* %yp, align 4 %mask = fcmp ogt <4 x double> %y, %x @@ -1011,7 +980,6 @@ ; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1 ; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <8 x double>, <8 x double>* %yp, align 4 %mask = fcmp olt <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1 @@ -1024,14 +992,12 @@ ; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2 ; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test34: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1 ; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %y = load <4 x float>, <4 x float>* %yp, align 4 %mask = fcmp olt <4 x float> %x, %y %max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1 @@ -1048,14 +1014,12 @@ ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test35: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1 ; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %y = load <8 x float>, <8 x float>* %yp, align 4 %mask = fcmp ogt <8 x float> %y, %x @@ -1069,7 +1033,6 @@ ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1 ; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %y = load <16 x float>, <16 x float>* %yp, align 4 %mask = fcmp olt <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1 @@ -1082,7 +1045,6 @@ ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <8 x double> undef, double %a, i32 0 @@ -1100,14 +1062,12 @@ ; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2 ; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test38: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1 ; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 @@ -1125,14 +1085,12 @@ ; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2 ; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test39: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1 ; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 @@ -1150,7 +1108,6 @@ ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 ; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <16 x float> undef, float %a, i32 0 @@ -1171,14 +1128,12 @@ ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test41: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1 ; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <8 x float> undef, float %a, i32 0 @@ -1196,14 +1151,12 @@ ; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2 ; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test42: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1 ; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <4 x float> undef, float %a, i32 0 @@ -1223,7 +1176,6 @@ ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq -; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test43: ; SKX: ## BB#0: @@ -1232,7 +1184,6 @@ ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq -; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <8 x double> undef, double %a, i32 0 Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) @@ -11,8 +11,8 @@ ; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 ; AVX512BW-NEXT: kmovq %rsi, %k1 ; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2 {%k1} {z} +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -22,8 +22,8 @@ ; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} -; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2 {%k1} {z} +; AVX512F-32-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) @@ -41,8 +41,8 @@ ; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -52,8 +52,8 @@ ; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) @@ -72,6 +72,7 @@ ; AVX512BW-NEXT: kmovq %rdx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512: @@ -81,6 +82,7 @@ ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1} ; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax) +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2) call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1) @@ -95,6 +97,7 @@ ; AVX512BW-NEXT: kmovd %edx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512: @@ -104,6 +107,7 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1} ; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax) +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2) call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1) @@ -253,8 +257,8 @@ ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512: @@ -263,8 +267,8 @@ ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) @@ -283,8 +287,8 @@ ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512: @@ -293,8 +297,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) @@ -313,8 +317,8 @@ ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512: @@ -323,8 +327,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) @@ -339,6 +343,7 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_pcmpeq_b: @@ -351,6 +356,7 @@ ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res @@ -362,6 +368,7 @@ ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_pcmpeq_b: @@ -375,6 +382,7 @@ ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res @@ -387,12 +395,14 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_pcmpeq_w: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res @@ -404,6 +414,7 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_pcmpeq_w: @@ -411,6 +422,7 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res @@ -423,6 +435,7 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_pcmpgt_b: @@ -435,6 +448,7 @@ ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res @@ -446,6 +460,7 @@ ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_pcmpgt_b: @@ -459,6 +474,7 @@ ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res @@ -471,12 +487,14 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_pcmpgt_w: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res @@ -488,6 +506,7 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_pcmpgt_w: @@ -495,6 +514,7 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res @@ -799,8 +819,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512: @@ -809,8 +829,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) @@ -829,8 +849,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512: @@ -839,8 +859,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} ; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) @@ -858,8 +878,8 @@ ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; @@ -868,8 +888,8 @@ ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm3, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) @@ -888,8 +908,8 @@ ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -898,8 +918,8 @@ ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) @@ -919,8 +939,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512: @@ -929,8 +949,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) @@ -949,8 +969,8 @@ ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512: @@ -959,8 +979,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1} ; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) @@ -979,8 +999,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512: @@ -989,8 +1009,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) @@ -1009,8 +1029,8 @@ ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512: @@ -1019,8 +1039,8 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1} ; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) @@ -1650,6 +1670,7 @@ ; AVX512BW-NEXT: kxnorq %k0, %k0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_cmp_b_512: @@ -1664,28 +1685,29 @@ ; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: addl (%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) @@ -1730,33 +1752,26 @@ ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax ; AVX512BW-NEXT: addq %rdi, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_cmp_b_512: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: pushl %ebx ; AVX512F-32-NEXT: .Lcfi5: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: pushl %ebx +; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .Lcfi6: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 -; AVX512F-32-NEXT: pushl %edi +; AVX512F-32-NEXT: subl $60, %esp ; AVX512F-32-NEXT: .Lcfi7: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 -; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 ; AVX512F-32-NEXT: .Lcfi8: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 -; AVX512F-32-NEXT: subl $60, %esp +; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .Lcfi9: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 -; AVX512F-32-NEXT: .Lcfi10: -; AVX512F-32-NEXT: .cfi_offset %esi, -20 -; AVX512F-32-NEXT: .Lcfi11: -; AVX512F-32-NEXT: .cfi_offset %edi, -16 -; AVX512F-32-NEXT: .Lcfi12: -; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: .Lcfi13: -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: .cfi_offset %ebx, -8 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al @@ -1777,39 +1792,39 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al @@ -1818,8 +1833,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1828,8 +1843,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al @@ -1837,8 +1852,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -1847,8 +1862,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -1859,8 +1874,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -1868,631 +1883,639 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C -; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: andl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: movl %ecx, %ebp ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E -; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: andl $3, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F -; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $15, %eax +; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrl $16, %ecx -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $16, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: movl %eax, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: shrb $3, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $4, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $4, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $5, %al -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $5, %dl +; AVX512F-32-NEXT: andb $1, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $6, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $6, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %CL %CL %ECX %ECX -; AVX512F-32-NEXT: shrb $7, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX +; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %ebx -; AVX512F-32-NEXT: shrl $24, %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX -; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $2, %bl -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $29, %eax -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %eax -; AVX512F-32-NEXT: shrl $30, %eax +; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %eax -; AVX512F-32-NEXT: shrl $31, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $4, %al +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $6, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bh, %al +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bh, %cl -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $13, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $16, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movb %ch, %dl +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: andl $15, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $13, %eax +; AVX512F-32-NEXT: andb $1, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: andl $3, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $15, %eax +; AVX512F-32-NEXT: andl $1, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: shrl $16, %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: andb $15, %al +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $2, %al +; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $4, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $5, %cl -; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $6, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX -; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $24, %eax +; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $6, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX +; AVX512F-32-NEXT: shrb $7, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: shrl $24, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 ; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0 ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1} +; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} +; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1} +; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl %ebp, %eax -; AVX512F-32-NEXT: adcxl %ebx, %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl %esi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi -; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx -; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) @@ -2537,12 +2560,13 @@ ; AVX512BW-NEXT: kxnorq %k0, %k0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_ucmp_b_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .Lcfi14: +; AVX512F-32-NEXT: .Lcfi10: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 64 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) @@ -2551,28 +2575,29 @@ ; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: addl (%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) @@ -2617,33 +2642,26 @@ ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax ; AVX512BW-NEXT: addq %rdi, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: pushl %ebp -; AVX512F-32-NEXT: .Lcfi15: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: pushl %ebx -; AVX512F-32-NEXT: .Lcfi16: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 -; AVX512F-32-NEXT: pushl %edi -; AVX512F-32-NEXT: .Lcfi17: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: .Lcfi11: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: pushl %esi -; AVX512F-32-NEXT: .Lcfi18: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 +; AVX512F-32-NEXT: .Lcfi12: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 ; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .Lcfi19: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 -; AVX512F-32-NEXT: .Lcfi20: -; AVX512F-32-NEXT: .cfi_offset %esi, -20 -; AVX512F-32-NEXT: .Lcfi21: -; AVX512F-32-NEXT: .cfi_offset %edi, -16 -; AVX512F-32-NEXT: .Lcfi22: -; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: .Lcfi23: -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: .Lcfi13: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: .Lcfi14: +; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: .Lcfi15: +; AVX512F-32-NEXT: .cfi_offset %ebx, -8 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al @@ -2664,39 +2682,39 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al @@ -2705,8 +2723,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2715,8 +2733,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al @@ -2724,8 +2742,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -2734,8 +2752,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -2746,8 +2764,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -2755,631 +2773,639 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C -; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: andl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: movl %ecx, %ebp ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E -; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: andl $3, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F -; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $15, %eax +; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrl $16, %ecx -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $16, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: movl %eax, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: shrb $3, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $4, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $4, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $5, %al -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $5, %dl +; AVX512F-32-NEXT: andb $1, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $6, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $6, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %CL %CL %ECX %ECX -; AVX512F-32-NEXT: shrb $7, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX +; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %ebx -; AVX512F-32-NEXT: shrl $24, %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX -; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $2, %bl -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $29, %eax -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %eax -; AVX512F-32-NEXT: shrl $30, %eax +; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebp, %eax -; AVX512F-32-NEXT: shrl $31, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $2, %al -; AVX512F-32-NEXT: shrb %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $4, %al +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: shrl $30, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $4, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: andb $1, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bh, %al +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bh, %cl -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movb %ch, %dl +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: andl $15, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: andl $3, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $15, %eax +; AVX512F-32-NEXT: andl $1, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: shrl $16, %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $16, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: andb $15, %al +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: shrb $2, %al +; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $4, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $5, %cl -; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: shrb $6, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX -; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $24, %eax +; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $6, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX +; AVX512F-32-NEXT: shrb $7, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrb $2, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: shrl $24, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 ; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 -; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0 ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1} +; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} +; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1} +; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl %ebp, %eax -; AVX512F-32-NEXT: adcxl %ebx, %edx +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl %esi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi -; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx -; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) @@ -3424,6 +3450,7 @@ ; AVX512BW-NEXT: kxnord %k0, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_cmp_w_512: @@ -3448,6 +3475,7 @@ ; AVX512F-32-NEXT: kxnord %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) @@ -3492,6 +3520,7 @@ ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax ; AVX512BW-NEXT: addl %edi, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_cmp_w_512: @@ -3519,6 +3548,7 @@ ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %edx, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) @@ -3563,6 +3593,7 @@ ; AVX512BW-NEXT: kxnord %k0, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_ucmp_w_512: @@ -3587,6 +3618,7 @@ ; AVX512F-32-NEXT: kxnord %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) @@ -3631,6 +3663,7 @@ ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax ; AVX512BW-NEXT: addl %edi, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_ucmp_w_512: @@ -3658,6 +3691,7 @@ ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %edx, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl --show-mc-encoding| FileCheck %s declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) @@ -9,8 +9,8 @@ ; CHECK-NEXT: vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7] -; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf] +; CHECK-NEXT: vpbroadcastb %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] ; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) @@ -30,8 +30,8 @@ ; CHECK-NEXT: vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf] +; CHECK-NEXT: vpbroadcastw %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] ; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) @@ -51,8 +51,8 @@ ; CHECK-NEXT: vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7] -; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf] +; CHECK-NEXT: vpbroadcastb %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7] +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] ; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) @@ -73,8 +73,8 @@ ; CHECK-NEXT: vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf] +; CHECK-NEXT: vpbroadcastw %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) @@ -93,9 +93,9 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) @@ -113,9 +113,9 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) @@ -133,9 +133,9 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) @@ -153,9 +153,9 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) @@ -173,9 +173,9 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] -; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) @@ -193,9 +193,9 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] -; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] +; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) @@ -227,6 +227,7 @@ ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07] ; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2) call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1) @@ -255,6 +256,7 @@ ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07] ; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2) call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1) @@ -345,8 +347,8 @@ ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] ; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) @@ -368,8 +370,8 @@ ; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] ; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] +; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) @@ -391,8 +393,8 @@ ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] ; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) @@ -414,8 +416,8 @@ ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] ; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) @@ -437,8 +439,8 @@ ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] ; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) @@ -460,8 +462,8 @@ ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] ; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) @@ -476,6 +478,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ret i32 %res @@ -487,6 +490,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ret i32 %res @@ -500,6 +504,7 @@ ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res @@ -512,6 +517,7 @@ ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res @@ -524,6 +530,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ret i32 %res @@ -535,6 +542,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ret i32 %res @@ -548,6 +556,7 @@ ; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res @@ -560,6 +569,7 @@ ; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res @@ -1660,9 +1670,9 @@ ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1680,8 +1690,8 @@ ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] +; CHECK-NEXT: vpaddw %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) @@ -1701,8 +1711,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1] ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1] -; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) @@ -1721,8 +1731,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1] ; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) @@ -1741,8 +1751,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1] ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1] -; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) @@ -1761,8 +1771,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1] ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) @@ -1780,9 +1790,9 @@ ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) @@ -1800,8 +1810,8 @@ ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) @@ -1821,8 +1831,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03] ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) @@ -1841,8 +1851,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03] ; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) @@ -1861,8 +1871,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03] ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) @@ -1881,8 +1891,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03] ; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) @@ -1936,8 +1946,8 @@ ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -1959,8 +1969,8 @@ ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) @@ -1980,8 +1990,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2000,8 +2010,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) @@ -2020,8 +2030,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) @@ -2040,8 +2050,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) @@ -2834,23 +2844,23 @@ ; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8] ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] -; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0] +; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] ; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] -; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02] +; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] +; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02] ; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] -; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02] +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] ; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8] ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] -; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02] -; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03] +; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] +; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) @@ -2932,23 +2942,23 @@ ; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8] ; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] -; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0] +; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] ; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] -; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] +; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] +; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] ; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] -; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02] +; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] ; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8] ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] -; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02] -; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03] +; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] +; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) @@ -2997,6 +3007,7 @@ ; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3043,6 +3054,7 @@ ; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3090,6 +3102,7 @@ ; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3136,6 +3149,7 @@ ; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=VLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: @@ -19,8 +19,23 @@ ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .Lcfi2: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -29,64 +44,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -95,8 +110,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -117,15 +138,30 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .Lcfi8: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .Lcfi9: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .Lcfi10: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -134,64 +170,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -200,8 +236,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -224,15 +266,30 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .Lcfi16: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .Lcfi17: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi8: +; NoVLX-NEXT: .Lcfi18: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -242,64 +299,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -308,8 +365,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -333,15 +396,30 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi9: +; NoVLX-NEXT: .Lcfi24: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi10: +; NoVLX-NEXT: .Lcfi25: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .Lcfi26: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -351,64 +429,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -417,8 +495,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -443,12 +527,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .Lcfi32: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .Lcfi33: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .Lcfi34: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -457,20 +541,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .Lcfi35: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi16: +; NoVLX-NEXT: .Lcfi36: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi17: +; NoVLX-NEXT: .Lcfi37: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi18: +; NoVLX-NEXT: .Lcfi38: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .Lcfi39: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -513,11 +601,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -529,15 +617,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -553,6 +637,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -573,12 +658,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .Lcfi40: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .Lcfi41: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .Lcfi42: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -587,20 +672,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .Lcfi43: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi24: +; NoVLX-NEXT: .Lcfi44: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi25: +; NoVLX-NEXT: .Lcfi45: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi26: +; NoVLX-NEXT: .Lcfi46: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .Lcfi47: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -643,11 +732,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -659,15 +748,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -683,6 +768,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -705,12 +791,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .Lcfi48: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .Lcfi49: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .Lcfi50: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -719,21 +805,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .Lcfi51: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi32: +; NoVLX-NEXT: .Lcfi52: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi33: +; NoVLX-NEXT: .Lcfi53: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi34: +; NoVLX-NEXT: .Lcfi54: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi35: +; NoVLX-NEXT: .Lcfi55: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -776,11 +866,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -792,15 +882,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -816,6 +902,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -839,12 +926,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi36: +; NoVLX-NEXT: .Lcfi56: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi37: +; NoVLX-NEXT: .Lcfi57: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi38: +; NoVLX-NEXT: .Lcfi58: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -853,21 +940,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi39: +; NoVLX-NEXT: .Lcfi59: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi40: +; NoVLX-NEXT: .Lcfi60: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi41: +; NoVLX-NEXT: .Lcfi61: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi42: +; NoVLX-NEXT: .Lcfi62: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi43: +; NoVLX-NEXT: .Lcfi63: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -910,11 +1001,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -926,15 +1017,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -950,6 +1037,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -975,12 +1063,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi44: +; NoVLX-NEXT: .Lcfi64: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi45: +; NoVLX-NEXT: .Lcfi65: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi46: +; NoVLX-NEXT: .Lcfi66: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1003,6 +1091,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -1024,12 +1113,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi47: +; NoVLX-NEXT: .Lcfi67: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi48: +; NoVLX-NEXT: .Lcfi68: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi49: +; NoVLX-NEXT: .Lcfi69: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1052,6 +1141,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -1075,12 +1165,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi50: +; NoVLX-NEXT: .Lcfi70: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi51: +; NoVLX-NEXT: .Lcfi71: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi52: +; NoVLX-NEXT: .Lcfi72: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1112,6 +1202,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -1136,12 +1227,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi53: +; NoVLX-NEXT: .Lcfi73: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi54: +; NoVLX-NEXT: .Lcfi74: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi55: +; NoVLX-NEXT: .Lcfi75: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1173,6 +1264,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -1205,6 +1297,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1233,6 +1326,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1264,6 +1358,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1296,6 +1391,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1320,12 +1416,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi56: +; NoVLX-NEXT: .Lcfi76: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi57: +; NoVLX-NEXT: .Lcfi77: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi58: +; NoVLX-NEXT: .Lcfi78: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1375,6 +1471,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1395,12 +1492,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi59: +; NoVLX-NEXT: .Lcfi79: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi60: +; NoVLX-NEXT: .Lcfi80: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi61: +; NoVLX-NEXT: .Lcfi81: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1450,6 +1547,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1472,12 +1570,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi62: +; NoVLX-NEXT: .Lcfi82: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi63: +; NoVLX-NEXT: .Lcfi83: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi64: +; NoVLX-NEXT: .Lcfi84: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1528,6 +1626,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1551,12 +1650,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi65: +; NoVLX-NEXT: .Lcfi85: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi66: +; NoVLX-NEXT: .Lcfi86: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi67: +; NoVLX-NEXT: .Lcfi87: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1607,6 +1706,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1631,12 +1731,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi68: +; NoVLX-NEXT: .Lcfi88: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi69: +; NoVLX-NEXT: .Lcfi89: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi70: +; NoVLX-NEXT: .Lcfi90: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1644,43 +1744,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1691,6 +1791,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1711,12 +1812,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi71: +; NoVLX-NEXT: .Lcfi91: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi72: +; NoVLX-NEXT: .Lcfi92: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi73: +; NoVLX-NEXT: .Lcfi93: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1724,43 +1825,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1771,6 +1872,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1793,12 +1895,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi74: +; NoVLX-NEXT: .Lcfi94: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi75: +; NoVLX-NEXT: .Lcfi95: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi76: +; NoVLX-NEXT: .Lcfi96: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1807,43 +1909,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1854,6 +1956,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1877,12 +1980,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi77: +; NoVLX-NEXT: .Lcfi97: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi78: +; NoVLX-NEXT: .Lcfi98: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi79: +; NoVLX-NEXT: .Lcfi99: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1891,43 +1994,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1938,6 +2041,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -1963,15 +2067,30 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi80: +; NoVLX-NEXT: .Lcfi100: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi81: +; NoVLX-NEXT: .Lcfi101: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi82: +; NoVLX-NEXT: .Lcfi102: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1980,64 +2099,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2046,8 +2165,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2069,15 +2194,30 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi83: +; NoVLX-NEXT: .Lcfi108: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi84: +; NoVLX-NEXT: .Lcfi109: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi85: +; NoVLX-NEXT: .Lcfi110: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2086,64 +2226,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2152,8 +2292,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2177,15 +2323,30 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi86: +; NoVLX-NEXT: .Lcfi116: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi87: +; NoVLX-NEXT: .Lcfi117: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi88: +; NoVLX-NEXT: .Lcfi118: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2195,64 +2356,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2261,8 +2422,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2287,15 +2454,30 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi89: +; NoVLX-NEXT: .Lcfi124: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi90: +; NoVLX-NEXT: .Lcfi125: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi91: +; NoVLX-NEXT: .Lcfi126: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2305,64 +2487,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2371,8 +2553,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2398,12 +2586,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi92: +; NoVLX-NEXT: .Lcfi132: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi93: +; NoVLX-NEXT: .Lcfi133: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi94: +; NoVLX-NEXT: .Lcfi134: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2412,20 +2600,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi95: +; NoVLX-NEXT: .Lcfi135: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi96: +; NoVLX-NEXT: .Lcfi136: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi97: +; NoVLX-NEXT: .Lcfi137: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi98: +; NoVLX-NEXT: .Lcfi138: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi99: +; NoVLX-NEXT: .Lcfi139: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2468,11 +2660,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2484,15 +2676,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2508,6 +2696,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2529,12 +2718,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi100: +; NoVLX-NEXT: .Lcfi140: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi101: +; NoVLX-NEXT: .Lcfi141: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi102: +; NoVLX-NEXT: .Lcfi142: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2543,20 +2732,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .Lcfi143: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .Lcfi144: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .Lcfi145: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .Lcfi146: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .Lcfi147: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2599,11 +2792,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2615,15 +2808,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2639,6 +2828,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2662,12 +2852,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi108: +; NoVLX-NEXT: .Lcfi148: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi109: +; NoVLX-NEXT: .Lcfi149: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi110: +; NoVLX-NEXT: .Lcfi150: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2676,21 +2866,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .Lcfi151: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .Lcfi152: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .Lcfi153: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .Lcfi154: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .Lcfi155: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2733,11 +2927,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2749,15 +2943,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2773,6 +2963,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2797,12 +2988,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi116: +; NoVLX-NEXT: .Lcfi156: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi117: +; NoVLX-NEXT: .Lcfi157: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi118: +; NoVLX-NEXT: .Lcfi158: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2811,21 +3002,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .Lcfi159: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .Lcfi160: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .Lcfi161: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .Lcfi162: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .Lcfi163: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2868,11 +3063,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2884,15 +3079,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2908,6 +3099,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -2933,58 +3125,62 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi124: +; NoVLX-NEXT: .Lcfi164: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi125: +; NoVLX-NEXT: .Lcfi165: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi126: +; NoVLX-NEXT: .Lcfi166: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -2992,194 +3188,190 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -3263,6 +3455,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -3284,68 +3477,69 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .Lcfi167: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .Lcfi168: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .Lcfi169: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -3353,8 +3547,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -3364,19 +3557,19 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 @@ -3529,6 +3722,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -3552,12 +3746,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .Lcfi170: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .Lcfi171: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi132: +; NoVLX-NEXT: .Lcfi172: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -3568,12 +3762,17 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -3581,10 +3780,9 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -3602,40 +3800,39 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -3643,72 +3840,69 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -3717,13 +3911,18 @@ ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -3732,152 +3931,147 @@ ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3891,6 +4085,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -3915,12 +4110,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi133: +; NoVLX-NEXT: .Lcfi173: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi134: +; NoVLX-NEXT: .Lcfi174: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi135: +; NoVLX-NEXT: .Lcfi175: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -3932,6 +4127,8 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -3944,20 +4141,19 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -3965,7 +4161,6 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -3988,174 +4183,174 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -4169,6 +4364,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -4210,8 +4406,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4228,6 +4424,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4265,8 +4462,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4283,6 +4480,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4305,6 +4503,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -4316,14 +4515,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -4340,8 +4538,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4358,6 +4556,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4417,8 +4616,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4435,6 +4634,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4478,8 +4678,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4496,6 +4696,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4520,6 +4721,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -4531,14 +4733,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -4555,8 +4756,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -4573,6 +4774,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4615,8 +4817,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4633,6 +4835,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4669,8 +4872,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4687,6 +4890,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4709,6 +4913,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -4720,14 +4925,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -4743,8 +4947,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4761,6 +4965,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4819,8 +5024,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4837,6 +5042,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4879,8 +5085,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4897,6 +5103,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4921,6 +5128,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -4932,14 +5140,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -4955,8 +5162,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -4973,6 +5180,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -4999,12 +5207,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi136: +; NoVLX-NEXT: .Lcfi176: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi137: +; NoVLX-NEXT: .Lcfi177: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi138: +; NoVLX-NEXT: .Lcfi178: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5022,6 +5230,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5042,12 +5251,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi139: +; NoVLX-NEXT: .Lcfi179: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi140: +; NoVLX-NEXT: .Lcfi180: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi141: +; NoVLX-NEXT: .Lcfi181: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5065,6 +5274,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5087,15 +5297,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi142: +; NoVLX-NEXT: .Lcfi182: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi143: +; NoVLX-NEXT: .Lcfi183: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi144: +; NoVLX-NEXT: .Lcfi184: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -5107,14 +5318,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -5128,6 +5338,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5152,12 +5363,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi145: +; NoVLX-NEXT: .Lcfi185: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi146: +; NoVLX-NEXT: .Lcfi186: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi147: +; NoVLX-NEXT: .Lcfi187: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5193,6 +5404,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5218,12 +5430,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi148: +; NoVLX-NEXT: .Lcfi188: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi149: +; NoVLX-NEXT: .Lcfi189: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi150: +; NoVLX-NEXT: .Lcfi190: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5242,6 +5454,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5265,16 +5478,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi151: +; NoVLX-NEXT: .Lcfi191: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi152: +; NoVLX-NEXT: .Lcfi192: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi153: +; NoVLX-NEXT: .Lcfi193: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -5286,14 +5500,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -5307,6 +5520,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5333,20 +5547,20 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi154: +; NoVLX-NEXT: .Lcfi194: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi155: +; NoVLX-NEXT: .Lcfi195: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi156: +; NoVLX-NEXT: .Lcfi196: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5362,6 +5576,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5382,20 +5597,20 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi157: +; NoVLX-NEXT: .Lcfi197: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi158: +; NoVLX-NEXT: .Lcfi198: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi159: +; NoVLX-NEXT: .Lcfi199: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5411,6 +5626,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5433,18 +5649,19 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi160: +; NoVLX-NEXT: .Lcfi200: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi161: +; NoVLX-NEXT: .Lcfi201: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi162: +; NoVLX-NEXT: .Lcfi202: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5457,14 +5674,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5480,6 +5696,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5504,18 +5721,19 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi163: +; NoVLX-NEXT: .Lcfi203: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi164: +; NoVLX-NEXT: .Lcfi204: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi165: +; NoVLX-NEXT: .Lcfi205: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5528,14 +5746,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5551,6 +5768,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5576,12 +5794,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi166: +; NoVLX-NEXT: .Lcfi206: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi167: +; NoVLX-NEXT: .Lcfi207: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi168: +; NoVLX-NEXT: .Lcfi208: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5589,8 +5807,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5606,6 +5824,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5629,12 +5848,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi169: +; NoVLX-NEXT: .Lcfi209: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi170: +; NoVLX-NEXT: .Lcfi210: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi171: +; NoVLX-NEXT: .Lcfi211: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5642,6 +5861,7 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5654,14 +5874,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5677,6 +5896,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -5711,6 +5931,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5739,6 +5960,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5770,6 +5992,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5802,6 +6025,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5834,6 +6058,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5866,6 +6091,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5892,12 +6118,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi172: +; NoVLX-NEXT: .Lcfi212: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi173: +; NoVLX-NEXT: .Lcfi213: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi174: +; NoVLX-NEXT: .Lcfi214: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5946,6 +6172,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -5967,12 +6194,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi175: +; NoVLX-NEXT: .Lcfi215: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi176: +; NoVLX-NEXT: .Lcfi216: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi177: +; NoVLX-NEXT: .Lcfi217: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6021,6 +6248,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6044,12 +6272,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi178: +; NoVLX-NEXT: .Lcfi218: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi179: +; NoVLX-NEXT: .Lcfi219: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi180: +; NoVLX-NEXT: .Lcfi220: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6100,6 +6328,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6124,12 +6353,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi181: +; NoVLX-NEXT: .Lcfi221: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi182: +; NoVLX-NEXT: .Lcfi222: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi183: +; NoVLX-NEXT: .Lcfi223: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6180,6 +6409,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6205,12 +6435,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi184: +; NoVLX-NEXT: .Lcfi224: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi185: +; NoVLX-NEXT: .Lcfi225: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi186: +; NoVLX-NEXT: .Lcfi226: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6259,6 +6489,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6283,12 +6514,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi187: +; NoVLX-NEXT: .Lcfi227: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi188: +; NoVLX-NEXT: .Lcfi228: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi189: +; NoVLX-NEXT: .Lcfi229: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6339,6 +6570,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6365,55 +6597,55 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi190: +; NoVLX-NEXT: .Lcfi230: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi191: +; NoVLX-NEXT: .Lcfi231: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi192: +; NoVLX-NEXT: .Lcfi232: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6424,6 +6656,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6445,55 +6678,55 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi193: +; NoVLX-NEXT: .Lcfi233: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi194: +; NoVLX-NEXT: .Lcfi234: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi195: +; NoVLX-NEXT: .Lcfi235: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6504,6 +6737,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6527,12 +6761,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi196: +; NoVLX-NEXT: .Lcfi236: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi197: +; NoVLX-NEXT: .Lcfi237: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi198: +; NoVLX-NEXT: .Lcfi238: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6541,43 +6775,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6588,6 +6822,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6612,12 +6847,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi199: +; NoVLX-NEXT: .Lcfi239: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi200: +; NoVLX-NEXT: .Lcfi240: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi201: +; NoVLX-NEXT: .Lcfi241: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6626,43 +6861,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6673,6 +6908,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6698,55 +6934,55 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi202: +; NoVLX-NEXT: .Lcfi242: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi203: +; NoVLX-NEXT: .Lcfi243: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi204: +; NoVLX-NEXT: .Lcfi244: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6757,6 +6993,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6781,12 +7018,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi205: +; NoVLX-NEXT: .Lcfi245: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi206: +; NoVLX-NEXT: .Lcfi246: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi207: +; NoVLX-NEXT: .Lcfi247: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6795,43 +7032,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6842,6 +7079,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -6868,78 +7106,93 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi208: +; NoVLX-NEXT: .Lcfi248: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi209: +; NoVLX-NEXT: .Lcfi249: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi210: +; NoVLX-NEXT: .Lcfi250: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -6948,8 +7201,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -6971,78 +7230,93 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi211: +; NoVLX-NEXT: .Lcfi256: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi212: +; NoVLX-NEXT: .Lcfi257: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi213: +; NoVLX-NEXT: .Lcfi258: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7051,8 +7325,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7076,79 +7356,94 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi214: +; NoVLX-NEXT: .Lcfi264: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi215: +; NoVLX-NEXT: .Lcfi265: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi216: +; NoVLX-NEXT: .Lcfi266: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7157,8 +7452,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7183,79 +7484,94 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi217: +; NoVLX-NEXT: .Lcfi272: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi218: +; NoVLX-NEXT: .Lcfi273: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi219: +; NoVLX-NEXT: .Lcfi274: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7264,11 +7580,17 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__a to <16 x i32> +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b %1 = bitcast <8 x i64> %load to <16 x i32> %2 = icmp eq <16 x i32> %0, %1 @@ -7291,78 +7613,93 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi220: +; NoVLX-NEXT: .Lcfi280: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi221: +; NoVLX-NEXT: .Lcfi281: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi222: +; NoVLX-NEXT: .Lcfi282: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7371,8 +7708,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7397,79 +7740,94 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi223: +; NoVLX-NEXT: .Lcfi288: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi224: +; NoVLX-NEXT: .Lcfi289: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi225: +; NoVLX-NEXT: .Lcfi290: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7478,8 +7836,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7506,12 +7870,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi226: +; NoVLX-NEXT: .Lcfi296: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: .Lcfi297: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .Lcfi298: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7520,17 +7884,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .Lcfi299: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .Lcfi300: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .Lcfi301: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .Lcfi302: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi233: +; NoVLX-NEXT: .Lcfi303: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7573,11 +7941,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7589,15 +7957,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7613,6 +7977,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7634,12 +7999,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi234: +; NoVLX-NEXT: .Lcfi304: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi235: +; NoVLX-NEXT: .Lcfi305: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi236: +; NoVLX-NEXT: .Lcfi306: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7648,17 +8013,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi237: +; NoVLX-NEXT: .Lcfi307: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi238: +; NoVLX-NEXT: .Lcfi308: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi239: +; NoVLX-NEXT: .Lcfi309: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi240: +; NoVLX-NEXT: .Lcfi310: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi241: +; NoVLX-NEXT: .Lcfi311: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7701,11 +8070,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7717,15 +8086,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7741,6 +8106,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7764,12 +8130,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi242: +; NoVLX-NEXT: .Lcfi312: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi243: +; NoVLX-NEXT: .Lcfi313: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi244: +; NoVLX-NEXT: .Lcfi314: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7778,18 +8144,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi245: +; NoVLX-NEXT: .Lcfi315: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi246: +; NoVLX-NEXT: .Lcfi316: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi247: +; NoVLX-NEXT: .Lcfi317: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi248: +; NoVLX-NEXT: .Lcfi318: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi249: +; NoVLX-NEXT: .Lcfi319: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7832,11 +8202,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7848,15 +8218,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7872,6 +8238,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -7896,12 +8263,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi250: +; NoVLX-NEXT: .Lcfi320: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .Lcfi321: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .Lcfi322: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7910,18 +8277,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .Lcfi323: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .Lcfi324: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .Lcfi325: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi256: +; NoVLX-NEXT: .Lcfi326: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi257: +; NoVLX-NEXT: .Lcfi327: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7964,11 +8335,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7980,15 +8351,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8004,6 +8371,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -8029,12 +8397,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi258: +; NoVLX-NEXT: .Lcfi328: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .Lcfi329: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .Lcfi330: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -8043,17 +8411,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .Lcfi331: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .Lcfi332: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .Lcfi333: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi264: +; NoVLX-NEXT: .Lcfi334: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi265: +; NoVLX-NEXT: .Lcfi335: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -8096,11 +8468,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -8112,15 +8484,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8136,6 +8504,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -8160,12 +8529,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi266: +; NoVLX-NEXT: .Lcfi336: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .Lcfi337: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .Lcfi338: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -8174,18 +8543,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .Lcfi339: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .Lcfi340: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .Lcfi341: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi272: +; NoVLX-NEXT: .Lcfi342: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi273: +; NoVLX-NEXT: .Lcfi343: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -8228,11 +8601,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -8244,15 +8617,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8268,6 +8637,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -8300,6 +8670,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8327,6 +8698,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8349,6 +8721,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8356,16 +8729,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8407,6 +8780,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8440,6 +8814,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8464,6 +8839,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8471,16 +8847,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8526,6 +8902,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8565,6 +8942,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8616,6 +8994,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8669,6 +9048,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8714,6 +9094,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8767,6 +9148,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8811,6 +9193,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8849,6 +9232,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8899,6 +9283,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8951,6 +9336,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -8995,6 +9381,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9047,6 +9434,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9073,12 +9461,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi274: +; NoVLX-NEXT: .Lcfi344: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .Lcfi345: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .Lcfi346: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9096,6 +9484,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9116,12 +9505,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .Lcfi347: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .Lcfi348: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .Lcfi349: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9139,6 +9528,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9161,15 +9551,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi280: +; NoVLX-NEXT: .Lcfi350: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi281: +; NoVLX-NEXT: .Lcfi351: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi282: +; NoVLX-NEXT: .Lcfi352: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9177,10 +9568,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -9194,6 +9584,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9218,12 +9609,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .Lcfi353: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .Lcfi354: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .Lcfi355: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9251,6 +9642,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9276,12 +9668,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .Lcfi356: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .Lcfi357: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi288: +; NoVLX-NEXT: .Lcfi358: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9300,6 +9692,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9323,16 +9716,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi289: +; NoVLX-NEXT: .Lcfi359: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi290: +; NoVLX-NEXT: .Lcfi360: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .Lcfi361: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9340,10 +9734,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -9357,6 +9750,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9383,20 +9777,20 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .Lcfi362: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .Lcfi363: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .Lcfi364: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9412,6 +9806,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9432,20 +9827,20 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .Lcfi365: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi296: +; NoVLX-NEXT: .Lcfi366: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi297: +; NoVLX-NEXT: .Lcfi367: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9461,6 +9856,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9483,12 +9879,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi298: +; NoVLX-NEXT: .Lcfi368: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi299: +; NoVLX-NEXT: .Lcfi369: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi300: +; NoVLX-NEXT: .Lcfi370: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9505,8 +9901,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9522,6 +9918,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9546,12 +9943,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi301: +; NoVLX-NEXT: .Lcfi371: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi302: +; NoVLX-NEXT: .Lcfi372: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi303: +; NoVLX-NEXT: .Lcfi373: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9568,8 +9965,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9585,6 +9982,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9610,12 +10008,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi304: +; NoVLX-NEXT: .Lcfi374: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi305: +; NoVLX-NEXT: .Lcfi375: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi306: +; NoVLX-NEXT: .Lcfi376: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9623,8 +10021,8 @@ ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9640,6 +10038,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9663,12 +10062,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi307: +; NoVLX-NEXT: .Lcfi377: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi308: +; NoVLX-NEXT: .Lcfi378: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi309: +; NoVLX-NEXT: .Lcfi379: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9686,8 +10085,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9703,6 +10102,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -9748,8 +10148,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -9766,6 +10166,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -9805,8 +10206,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -9823,6 +10224,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -9847,6 +10249,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9864,7 +10267,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -9882,8 +10284,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -9900,6 +10302,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -9926,6 +10329,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9943,7 +10347,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -9961,8 +10364,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -9979,6 +10382,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10024,8 +10428,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -10042,6 +10446,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10068,6 +10473,7 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10085,7 +10491,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -10103,8 +10508,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -10121,6 +10526,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10165,8 +10571,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10183,6 +10589,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10221,8 +10628,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10239,6 +10646,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10263,6 +10671,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10280,7 +10689,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -10297,8 +10705,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10315,6 +10723,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10341,6 +10750,7 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10358,7 +10768,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -10375,8 +10784,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10393,6 +10802,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10437,8 +10847,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10455,6 +10865,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10481,6 +10892,7 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10498,7 +10910,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -10515,8 +10926,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -10533,6 +10944,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10560,12 +10972,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi310: +; NoVLX-NEXT: .Lcfi380: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi311: +; NoVLX-NEXT: .Lcfi381: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi312: +; NoVLX-NEXT: .Lcfi382: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10584,6 +10996,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10605,12 +11018,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi313: +; NoVLX-NEXT: .Lcfi383: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi314: +; NoVLX-NEXT: .Lcfi384: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi315: +; NoVLX-NEXT: .Lcfi385: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10629,6 +11042,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10652,16 +11066,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi316: +; NoVLX-NEXT: .Lcfi386: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi317: +; NoVLX-NEXT: .Lcfi387: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi318: +; NoVLX-NEXT: .Lcfi388: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10679,7 +11094,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10694,6 +11108,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10719,16 +11134,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi319: +; NoVLX-NEXT: .Lcfi389: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi320: +; NoVLX-NEXT: .Lcfi390: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi321: +; NoVLX-NEXT: .Lcfi391: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10746,7 +11162,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10761,6 +11176,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10787,12 +11203,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi322: +; NoVLX-NEXT: .Lcfi392: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi323: +; NoVLX-NEXT: .Lcfi393: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi324: +; NoVLX-NEXT: .Lcfi394: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10812,6 +11228,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10836,17 +11253,18 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi325: +; NoVLX-NEXT: .Lcfi395: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi326: +; NoVLX-NEXT: .Lcfi396: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi327: +; NoVLX-NEXT: .Lcfi397: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -10864,7 +11282,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10879,6 +11296,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10906,12 +11324,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi328: +; NoVLX-NEXT: .Lcfi398: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi329: +; NoVLX-NEXT: .Lcfi399: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi330: +; NoVLX-NEXT: .Lcfi400: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -10919,8 +11337,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -10936,6 +11354,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -10957,12 +11376,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi331: +; NoVLX-NEXT: .Lcfi401: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi332: +; NoVLX-NEXT: .Lcfi402: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi333: +; NoVLX-NEXT: .Lcfi403: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -10970,8 +11389,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -10987,6 +11406,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -11010,12 +11430,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi334: +; NoVLX-NEXT: .Lcfi404: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi335: +; NoVLX-NEXT: .Lcfi405: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi336: +; NoVLX-NEXT: .Lcfi406: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11023,6 +11443,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -11035,14 +11456,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11058,6 +11478,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -11083,12 +11504,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi337: +; NoVLX-NEXT: .Lcfi407: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi338: +; NoVLX-NEXT: .Lcfi408: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi339: +; NoVLX-NEXT: .Lcfi409: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11096,6 +11517,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -11108,14 +11530,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11131,6 +11552,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -11157,12 +11579,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi340: +; NoVLX-NEXT: .Lcfi410: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi341: +; NoVLX-NEXT: .Lcfi411: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi342: +; NoVLX-NEXT: .Lcfi412: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11171,8 +11593,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11188,6 +11610,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -11212,12 +11635,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi343: +; NoVLX-NEXT: .Lcfi413: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi344: +; NoVLX-NEXT: .Lcfi414: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi345: +; NoVLX-NEXT: .Lcfi415: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11226,6 +11649,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -11238,14 +11662,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11261,6 +11684,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -11291,6 +11715,7 @@ ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11315,6 +11740,7 @@ ; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11342,6 +11768,7 @@ ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11370,6 +11797,7 @@ ; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11398,6 +11826,7 @@ ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11426,6 +11855,7 @@ ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11452,12 +11882,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi346: +; NoVLX-NEXT: .Lcfi416: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi347: +; NoVLX-NEXT: .Lcfi417: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi348: +; NoVLX-NEXT: .Lcfi418: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11504,6 +11934,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11525,12 +11956,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi349: +; NoVLX-NEXT: .Lcfi419: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi350: +; NoVLX-NEXT: .Lcfi420: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi351: +; NoVLX-NEXT: .Lcfi421: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11577,6 +12008,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11600,12 +12032,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi352: +; NoVLX-NEXT: .Lcfi422: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi353: +; NoVLX-NEXT: .Lcfi423: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi354: +; NoVLX-NEXT: .Lcfi424: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11653,6 +12085,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11677,12 +12110,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi355: +; NoVLX-NEXT: .Lcfi425: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi356: +; NoVLX-NEXT: .Lcfi426: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi357: +; NoVLX-NEXT: .Lcfi427: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11730,6 +12163,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11755,12 +12189,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi358: +; NoVLX-NEXT: .Lcfi428: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi359: +; NoVLX-NEXT: .Lcfi429: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi360: +; NoVLX-NEXT: .Lcfi430: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11807,6 +12241,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11831,12 +12266,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi361: +; NoVLX-NEXT: .Lcfi431: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi362: +; NoVLX-NEXT: .Lcfi432: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi363: +; NoVLX-NEXT: .Lcfi433: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11884,6 +12319,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11910,53 +12346,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi364: +; NoVLX-NEXT: .Lcfi434: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi365: +; NoVLX-NEXT: .Lcfi435: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi366: +; NoVLX-NEXT: .Lcfi436: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11967,6 +12403,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -11988,53 +12425,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi367: +; NoVLX-NEXT: .Lcfi437: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi368: +; NoVLX-NEXT: .Lcfi438: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi369: +; NoVLX-NEXT: .Lcfi439: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12045,6 +12482,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -12068,54 +12506,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi370: +; NoVLX-NEXT: .Lcfi440: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi371: +; NoVLX-NEXT: .Lcfi441: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi372: +; NoVLX-NEXT: .Lcfi442: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12126,6 +12564,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -12150,54 +12589,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi373: +; NoVLX-NEXT: .Lcfi443: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi374: +; NoVLX-NEXT: .Lcfi444: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi375: +; NoVLX-NEXT: .Lcfi445: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12208,6 +12647,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -12233,53 +12673,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi376: +; NoVLX-NEXT: .Lcfi446: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi377: +; NoVLX-NEXT: .Lcfi447: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi378: +; NoVLX-NEXT: .Lcfi448: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12290,6 +12730,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -12314,54 +12755,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi379: +; NoVLX-NEXT: .Lcfi449: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi380: +; NoVLX-NEXT: .Lcfi450: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi381: +; NoVLX-NEXT: .Lcfi451: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12372,6 +12813,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -12397,15 +12839,30 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi382: +; NoVLX-NEXT: .Lcfi452: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi383: +; NoVLX-NEXT: .Lcfi453: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi384: +; NoVLX-NEXT: .Lcfi454: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -12414,64 +12871,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -12480,8 +12937,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -12502,15 +12965,30 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi385: +; NoVLX-NEXT: .Lcfi460: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi386: +; NoVLX-NEXT: .Lcfi461: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi387: +; NoVLX-NEXT: .Lcfi462: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -12519,64 +12997,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -12585,8 +13063,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -12609,15 +13093,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi388: +; NoVLX-NEXT: .Lcfi468: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi389: +; NoVLX-NEXT: .Lcfi469: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi390: +; NoVLX-NEXT: .Lcfi470: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -12627,64 +13126,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -12693,8 +13192,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -12718,15 +13223,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi391: +; NoVLX-NEXT: .Lcfi476: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi392: +; NoVLX-NEXT: .Lcfi477: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi393: +; NoVLX-NEXT: .Lcfi478: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -12736,64 +13256,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -12802,8 +13322,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -12828,12 +13354,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi394: +; NoVLX-NEXT: .Lcfi484: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi395: +; NoVLX-NEXT: .Lcfi485: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi396: +; NoVLX-NEXT: .Lcfi486: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12842,20 +13368,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi397: +; NoVLX-NEXT: .Lcfi487: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi398: +; NoVLX-NEXT: .Lcfi488: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi399: +; NoVLX-NEXT: .Lcfi489: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi400: +; NoVLX-NEXT: .Lcfi490: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi401: +; NoVLX-NEXT: .Lcfi491: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -12898,11 +13428,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -12914,15 +13444,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12938,6 +13464,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -12958,12 +13485,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi402: +; NoVLX-NEXT: .Lcfi492: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi403: +; NoVLX-NEXT: .Lcfi493: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi404: +; NoVLX-NEXT: .Lcfi494: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12972,20 +13499,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi405: +; NoVLX-NEXT: .Lcfi495: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi406: +; NoVLX-NEXT: .Lcfi496: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi407: +; NoVLX-NEXT: .Lcfi497: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi408: +; NoVLX-NEXT: .Lcfi498: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi409: +; NoVLX-NEXT: .Lcfi499: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13028,11 +13559,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13044,15 +13575,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13068,6 +13595,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -13090,12 +13618,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi410: +; NoVLX-NEXT: .Lcfi500: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi411: +; NoVLX-NEXT: .Lcfi501: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi412: +; NoVLX-NEXT: .Lcfi502: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -13104,21 +13632,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi413: +; NoVLX-NEXT: .Lcfi503: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi414: +; NoVLX-NEXT: .Lcfi504: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi415: +; NoVLX-NEXT: .Lcfi505: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi416: +; NoVLX-NEXT: .Lcfi506: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi417: +; NoVLX-NEXT: .Lcfi507: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13161,11 +13693,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13177,15 +13709,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13201,6 +13729,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -13224,12 +13753,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi418: +; NoVLX-NEXT: .Lcfi508: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi419: +; NoVLX-NEXT: .Lcfi509: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi420: +; NoVLX-NEXT: .Lcfi510: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -13238,21 +13767,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi421: +; NoVLX-NEXT: .Lcfi511: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi422: +; NoVLX-NEXT: .Lcfi512: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi423: +; NoVLX-NEXT: .Lcfi513: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi424: +; NoVLX-NEXT: .Lcfi514: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi425: +; NoVLX-NEXT: .Lcfi515: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13295,11 +13828,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13311,15 +13844,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13335,6 +13864,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -13360,12 +13890,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi426: +; NoVLX-NEXT: .Lcfi516: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi427: +; NoVLX-NEXT: .Lcfi517: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi428: +; NoVLX-NEXT: .Lcfi518: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13388,6 +13918,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -13409,12 +13940,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi429: +; NoVLX-NEXT: .Lcfi519: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi430: +; NoVLX-NEXT: .Lcfi520: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi431: +; NoVLX-NEXT: .Lcfi521: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13437,6 +13968,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -13460,12 +13992,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi432: +; NoVLX-NEXT: .Lcfi522: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi433: +; NoVLX-NEXT: .Lcfi523: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi434: +; NoVLX-NEXT: .Lcfi524: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -13497,6 +14029,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -13521,12 +14054,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi435: +; NoVLX-NEXT: .Lcfi525: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi436: +; NoVLX-NEXT: .Lcfi526: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi437: +; NoVLX-NEXT: .Lcfi527: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -13558,6 +14091,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -13590,6 +14124,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13618,6 +14153,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13649,6 +14185,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13681,6 +14218,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13705,12 +14243,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi438: +; NoVLX-NEXT: .Lcfi528: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi439: +; NoVLX-NEXT: .Lcfi529: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi440: +; NoVLX-NEXT: .Lcfi530: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13760,6 +14298,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13780,12 +14319,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi441: +; NoVLX-NEXT: .Lcfi531: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi442: +; NoVLX-NEXT: .Lcfi532: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi443: +; NoVLX-NEXT: .Lcfi533: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13835,6 +14374,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13857,12 +14397,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi444: +; NoVLX-NEXT: .Lcfi534: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi445: +; NoVLX-NEXT: .Lcfi535: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi446: +; NoVLX-NEXT: .Lcfi536: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13913,6 +14453,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -13936,12 +14477,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi447: +; NoVLX-NEXT: .Lcfi537: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi448: +; NoVLX-NEXT: .Lcfi538: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi449: +; NoVLX-NEXT: .Lcfi539: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13992,6 +14533,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -14016,12 +14558,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi450: +; NoVLX-NEXT: .Lcfi540: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi451: +; NoVLX-NEXT: .Lcfi541: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi452: +; NoVLX-NEXT: .Lcfi542: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14029,43 +14571,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14076,6 +14618,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -14096,12 +14639,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi453: +; NoVLX-NEXT: .Lcfi543: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi454: +; NoVLX-NEXT: .Lcfi544: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .Lcfi545: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14109,43 +14652,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14156,6 +14699,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -14178,12 +14722,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .Lcfi546: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .Lcfi547: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .Lcfi548: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14192,43 +14736,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14239,6 +14783,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -14262,12 +14807,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .Lcfi549: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi460: +; NoVLX-NEXT: .Lcfi550: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi461: +; NoVLX-NEXT: .Lcfi551: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14276,43 +14821,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14323,6 +14868,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -14348,15 +14894,30 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi462: +; NoVLX-NEXT: .Lcfi552: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .Lcfi553: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .Lcfi554: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14365,64 +14926,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -14431,8 +14992,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -14454,15 +15021,30 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .Lcfi560: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .Lcfi561: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .Lcfi562: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14471,64 +15053,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -14537,8 +15119,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -14562,15 +15150,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi468: +; NoVLX-NEXT: .Lcfi568: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi469: +; NoVLX-NEXT: .Lcfi569: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi470: +; NoVLX-NEXT: .Lcfi570: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14580,64 +15183,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -14646,8 +15249,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -14672,15 +15281,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .Lcfi576: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .Lcfi577: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .Lcfi578: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14690,64 +15314,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -14756,8 +15380,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -14783,12 +15413,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .Lcfi584: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .Lcfi585: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi476: +; NoVLX-NEXT: .Lcfi586: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14797,20 +15427,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi477: +; NoVLX-NEXT: .Lcfi587: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi478: +; NoVLX-NEXT: .Lcfi588: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .Lcfi589: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .Lcfi590: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .Lcfi591: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -14853,11 +15487,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -14869,15 +15503,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14893,6 +15523,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -14914,12 +15545,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .Lcfi592: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .Lcfi593: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi484: +; NoVLX-NEXT: .Lcfi594: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14928,20 +15559,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi485: +; NoVLX-NEXT: .Lcfi595: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi486: +; NoVLX-NEXT: .Lcfi596: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi487: +; NoVLX-NEXT: .Lcfi597: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi488: +; NoVLX-NEXT: .Lcfi598: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi489: +; NoVLX-NEXT: .Lcfi599: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -14984,11 +15619,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15000,15 +15635,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15024,6 +15655,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -15047,12 +15679,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi490: +; NoVLX-NEXT: .Lcfi600: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi491: +; NoVLX-NEXT: .Lcfi601: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi492: +; NoVLX-NEXT: .Lcfi602: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -15061,21 +15693,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi493: +; NoVLX-NEXT: .Lcfi603: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi494: +; NoVLX-NEXT: .Lcfi604: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi495: +; NoVLX-NEXT: .Lcfi605: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi496: +; NoVLX-NEXT: .Lcfi606: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi497: +; NoVLX-NEXT: .Lcfi607: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -15118,11 +15754,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15134,15 +15770,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15158,6 +15790,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -15182,12 +15815,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi498: +; NoVLX-NEXT: .Lcfi608: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi499: +; NoVLX-NEXT: .Lcfi609: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi500: +; NoVLX-NEXT: .Lcfi610: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -15196,21 +15829,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi501: +; NoVLX-NEXT: .Lcfi611: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi502: +; NoVLX-NEXT: .Lcfi612: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi503: +; NoVLX-NEXT: .Lcfi613: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi504: +; NoVLX-NEXT: .Lcfi614: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi505: +; NoVLX-NEXT: .Lcfi615: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -15253,11 +15890,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15269,15 +15906,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15293,6 +15926,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -15318,58 +15952,62 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi506: +; NoVLX-NEXT: .Lcfi616: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi507: +; NoVLX-NEXT: .Lcfi617: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi508: +; NoVLX-NEXT: .Lcfi618: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -15377,194 +16015,190 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -15648,6 +16282,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -15669,68 +16304,69 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi509: +; NoVLX-NEXT: .Lcfi619: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi510: +; NoVLX-NEXT: .Lcfi620: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi511: +; NoVLX-NEXT: .Lcfi621: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -15738,8 +16374,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -15749,19 +16384,19 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 @@ -15914,6 +16549,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -15937,12 +16573,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi512: +; NoVLX-NEXT: .Lcfi622: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi513: +; NoVLX-NEXT: .Lcfi623: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi514: +; NoVLX-NEXT: .Lcfi624: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -15953,12 +16589,17 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -15966,10 +16607,9 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -15987,40 +16627,39 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -16028,72 +16667,69 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -16102,13 +16738,18 @@ ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -16117,152 +16758,147 @@ ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm1, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16276,6 +16912,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -16300,12 +16937,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi515: +; NoVLX-NEXT: .Lcfi625: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi516: +; NoVLX-NEXT: .Lcfi626: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi517: +; NoVLX-NEXT: .Lcfi627: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -16317,6 +16954,8 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -16329,20 +16968,19 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -16350,7 +16988,6 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -16373,174 +17010,174 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16554,6 +17191,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -16595,8 +17233,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16613,6 +17251,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -16650,8 +17289,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16668,6 +17307,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -16690,6 +17330,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -16701,14 +17342,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -16725,8 +17365,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16743,6 +17383,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -16802,8 +17443,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16820,6 +17461,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -16863,8 +17505,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16881,6 +17523,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -16905,6 +17548,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -16916,14 +17560,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -16940,8 +17583,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -16958,6 +17601,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17000,8 +17644,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17018,6 +17662,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17054,8 +17699,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17072,6 +17717,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17094,6 +17740,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -17105,14 +17752,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -17128,8 +17774,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17146,6 +17792,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17204,8 +17851,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17222,6 +17869,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17264,8 +17912,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17282,6 +17930,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17306,6 +17955,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -17317,14 +17967,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -17340,8 +17989,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17358,6 +18007,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17384,12 +18034,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi518: +; NoVLX-NEXT: .Lcfi628: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi519: +; NoVLX-NEXT: .Lcfi629: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi520: +; NoVLX-NEXT: .Lcfi630: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17407,6 +18057,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17427,12 +18078,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi521: +; NoVLX-NEXT: .Lcfi631: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi522: +; NoVLX-NEXT: .Lcfi632: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi523: +; NoVLX-NEXT: .Lcfi633: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17450,6 +18101,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17472,15 +18124,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi524: +; NoVLX-NEXT: .Lcfi634: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi525: +; NoVLX-NEXT: .Lcfi635: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi526: +; NoVLX-NEXT: .Lcfi636: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -17492,14 +18145,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -17513,6 +18165,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17537,12 +18190,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi527: +; NoVLX-NEXT: .Lcfi637: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi528: +; NoVLX-NEXT: .Lcfi638: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi529: +; NoVLX-NEXT: .Lcfi639: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17578,6 +18231,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17603,12 +18257,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi530: +; NoVLX-NEXT: .Lcfi640: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi531: +; NoVLX-NEXT: .Lcfi641: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi532: +; NoVLX-NEXT: .Lcfi642: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17627,6 +18281,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17650,16 +18305,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi533: +; NoVLX-NEXT: .Lcfi643: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi534: +; NoVLX-NEXT: .Lcfi644: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi535: +; NoVLX-NEXT: .Lcfi645: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -17671,14 +18327,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -17692,6 +18347,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17718,20 +18374,20 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi536: +; NoVLX-NEXT: .Lcfi646: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi537: +; NoVLX-NEXT: .Lcfi647: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi538: +; NoVLX-NEXT: .Lcfi648: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17747,6 +18403,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17767,20 +18424,20 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi539: +; NoVLX-NEXT: .Lcfi649: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi540: +; NoVLX-NEXT: .Lcfi650: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi541: +; NoVLX-NEXT: .Lcfi651: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17796,6 +18453,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17818,18 +18476,19 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi542: +; NoVLX-NEXT: .Lcfi652: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi543: +; NoVLX-NEXT: .Lcfi653: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi544: +; NoVLX-NEXT: .Lcfi654: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17842,14 +18501,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17865,6 +18523,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17889,18 +18548,19 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi545: +; NoVLX-NEXT: .Lcfi655: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi546: +; NoVLX-NEXT: .Lcfi656: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi547: +; NoVLX-NEXT: .Lcfi657: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17913,14 +18573,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17936,6 +18595,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -17961,12 +18621,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi548: +; NoVLX-NEXT: .Lcfi658: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi549: +; NoVLX-NEXT: .Lcfi659: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi550: +; NoVLX-NEXT: .Lcfi660: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -17974,8 +18634,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17991,6 +18651,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -18014,12 +18675,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi551: +; NoVLX-NEXT: .Lcfi661: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi552: +; NoVLX-NEXT: .Lcfi662: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi553: +; NoVLX-NEXT: .Lcfi663: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18027,6 +18688,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -18039,14 +18701,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -18062,6 +18723,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -18096,6 +18758,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18124,6 +18787,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18155,6 +18819,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18187,6 +18852,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18219,6 +18885,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18251,6 +18918,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18277,12 +18945,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi554: +; NoVLX-NEXT: .Lcfi664: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .Lcfi665: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .Lcfi666: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18331,6 +18999,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18352,12 +19021,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .Lcfi667: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .Lcfi668: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .Lcfi669: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18406,6 +19075,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18429,12 +19099,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi560: +; NoVLX-NEXT: .Lcfi670: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi561: +; NoVLX-NEXT: .Lcfi671: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi562: +; NoVLX-NEXT: .Lcfi672: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18485,6 +19155,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18509,12 +19180,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .Lcfi673: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .Lcfi674: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .Lcfi675: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18565,6 +19236,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18590,12 +19262,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .Lcfi676: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .Lcfi677: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi568: +; NoVLX-NEXT: .Lcfi678: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18644,6 +19316,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18668,12 +19341,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi569: +; NoVLX-NEXT: .Lcfi679: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi570: +; NoVLX-NEXT: .Lcfi680: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .Lcfi681: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18724,6 +19397,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18750,55 +19424,55 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .Lcfi682: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .Lcfi683: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .Lcfi684: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18809,6 +19483,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18830,55 +19505,55 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .Lcfi685: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi576: +; NoVLX-NEXT: .Lcfi686: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi577: +; NoVLX-NEXT: .Lcfi687: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18889,6 +19564,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18912,12 +19588,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi578: +; NoVLX-NEXT: .Lcfi688: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .Lcfi689: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .Lcfi690: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18926,43 +19602,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18973,6 +19649,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -18997,12 +19674,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .Lcfi691: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .Lcfi692: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .Lcfi693: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19011,43 +19688,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19058,6 +19735,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -19083,55 +19761,55 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi584: +; NoVLX-NEXT: .Lcfi694: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi585: +; NoVLX-NEXT: .Lcfi695: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi586: +; NoVLX-NEXT: .Lcfi696: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19142,6 +19820,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -19166,12 +19845,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi587: +; NoVLX-NEXT: .Lcfi697: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi588: +; NoVLX-NEXT: .Lcfi698: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi589: +; NoVLX-NEXT: .Lcfi699: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19180,43 +19859,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19227,6 +19906,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -19253,78 +19933,93 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi590: +; NoVLX-NEXT: .Lcfi700: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi591: +; NoVLX-NEXT: .Lcfi701: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi592: +; NoVLX-NEXT: .Lcfi702: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19333,8 +20028,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -19356,78 +20057,93 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi593: +; NoVLX-NEXT: .Lcfi708: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi594: +; NoVLX-NEXT: .Lcfi709: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi595: +; NoVLX-NEXT: .Lcfi710: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19436,8 +20152,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -19461,79 +20183,94 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi596: +; NoVLX-NEXT: .Lcfi716: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi597: +; NoVLX-NEXT: .Lcfi717: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi598: +; NoVLX-NEXT: .Lcfi718: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19542,8 +20279,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -19568,79 +20311,94 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi599: +; NoVLX-NEXT: .Lcfi724: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi600: +; NoVLX-NEXT: .Lcfi725: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi601: +; NoVLX-NEXT: .Lcfi726: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19649,11 +20407,17 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__a to <16 x i32> +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b %1 = bitcast <8 x i64> %load to <16 x i32> %2 = icmp sgt <16 x i32> %0, %1 @@ -19676,78 +20440,93 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi602: +; NoVLX-NEXT: .Lcfi732: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi603: +; NoVLX-NEXT: .Lcfi733: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi604: +; NoVLX-NEXT: .Lcfi734: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19756,8 +20535,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -19782,79 +20567,94 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi605: +; NoVLX-NEXT: .Lcfi740: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi606: +; NoVLX-NEXT: .Lcfi741: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi607: +; NoVLX-NEXT: .Lcfi742: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -19863,8 +20663,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -19891,12 +20697,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi608: +; NoVLX-NEXT: .Lcfi748: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi609: +; NoVLX-NEXT: .Lcfi749: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi610: +; NoVLX-NEXT: .Lcfi750: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -19905,17 +20711,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi611: +; NoVLX-NEXT: .Lcfi751: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi612: +; NoVLX-NEXT: .Lcfi752: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi613: +; NoVLX-NEXT: .Lcfi753: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi614: +; NoVLX-NEXT: .Lcfi754: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi615: +; NoVLX-NEXT: .Lcfi755: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -19958,11 +20768,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -19974,15 +20784,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19998,6 +20804,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20019,12 +20826,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi616: +; NoVLX-NEXT: .Lcfi756: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi617: +; NoVLX-NEXT: .Lcfi757: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi618: +; NoVLX-NEXT: .Lcfi758: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20033,17 +20840,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi619: +; NoVLX-NEXT: .Lcfi759: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi620: +; NoVLX-NEXT: .Lcfi760: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi621: +; NoVLX-NEXT: .Lcfi761: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi622: +; NoVLX-NEXT: .Lcfi762: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi623: +; NoVLX-NEXT: .Lcfi763: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20086,11 +20897,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20102,15 +20913,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20126,6 +20933,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20149,12 +20957,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi624: +; NoVLX-NEXT: .Lcfi764: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi625: +; NoVLX-NEXT: .Lcfi765: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi626: +; NoVLX-NEXT: .Lcfi766: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20163,18 +20971,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi627: +; NoVLX-NEXT: .Lcfi767: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi628: +; NoVLX-NEXT: .Lcfi768: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi629: +; NoVLX-NEXT: .Lcfi769: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi630: +; NoVLX-NEXT: .Lcfi770: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi631: +; NoVLX-NEXT: .Lcfi771: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20217,11 +21029,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20233,15 +21045,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20257,6 +21065,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20281,12 +21090,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi632: +; NoVLX-NEXT: .Lcfi772: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi633: +; NoVLX-NEXT: .Lcfi773: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi634: +; NoVLX-NEXT: .Lcfi774: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20295,18 +21104,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi635: +; NoVLX-NEXT: .Lcfi775: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi636: +; NoVLX-NEXT: .Lcfi776: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi637: +; NoVLX-NEXT: .Lcfi777: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi638: +; NoVLX-NEXT: .Lcfi778: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi639: +; NoVLX-NEXT: .Lcfi779: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20349,11 +21162,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20365,15 +21178,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20389,6 +21198,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20414,12 +21224,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi640: +; NoVLX-NEXT: .Lcfi780: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi641: +; NoVLX-NEXT: .Lcfi781: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi642: +; NoVLX-NEXT: .Lcfi782: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20428,17 +21238,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi643: +; NoVLX-NEXT: .Lcfi783: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi644: +; NoVLX-NEXT: .Lcfi784: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi645: +; NoVLX-NEXT: .Lcfi785: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi646: +; NoVLX-NEXT: .Lcfi786: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi647: +; NoVLX-NEXT: .Lcfi787: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20481,11 +21295,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20497,15 +21311,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20521,6 +21331,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20545,12 +21356,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi648: +; NoVLX-NEXT: .Lcfi788: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi649: +; NoVLX-NEXT: .Lcfi789: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi650: +; NoVLX-NEXT: .Lcfi790: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20559,18 +21370,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi651: +; NoVLX-NEXT: .Lcfi791: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi652: +; NoVLX-NEXT: .Lcfi792: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi653: +; NoVLX-NEXT: .Lcfi793: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi654: +; NoVLX-NEXT: .Lcfi794: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi655: +; NoVLX-NEXT: .Lcfi795: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20613,11 +21428,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20629,15 +21444,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20653,6 +21464,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -20685,6 +21497,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20712,6 +21525,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20734,6 +21548,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -20741,16 +21556,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20792,6 +21607,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20825,6 +21641,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20849,6 +21666,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -20856,16 +21674,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20911,6 +21729,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -20950,6 +21769,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21001,6 +21821,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21054,6 +21875,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21099,6 +21921,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21152,6 +21975,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21196,6 +22020,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21234,6 +22059,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21284,6 +22110,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21336,6 +22163,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21380,6 +22208,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21432,6 +22261,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21458,12 +22288,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi656: +; NoVLX-NEXT: .Lcfi796: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi657: +; NoVLX-NEXT: .Lcfi797: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi658: +; NoVLX-NEXT: .Lcfi798: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21481,6 +22311,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21501,12 +22332,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi659: +; NoVLX-NEXT: .Lcfi799: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi660: +; NoVLX-NEXT: .Lcfi800: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi661: +; NoVLX-NEXT: .Lcfi801: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21524,6 +22355,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21546,15 +22378,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi662: +; NoVLX-NEXT: .Lcfi802: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi663: +; NoVLX-NEXT: .Lcfi803: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi664: +; NoVLX-NEXT: .Lcfi804: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21562,10 +22395,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -21579,6 +22411,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21603,12 +22436,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi665: +; NoVLX-NEXT: .Lcfi805: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi666: +; NoVLX-NEXT: .Lcfi806: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi667: +; NoVLX-NEXT: .Lcfi807: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21636,6 +22469,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21661,12 +22495,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi668: +; NoVLX-NEXT: .Lcfi808: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi669: +; NoVLX-NEXT: .Lcfi809: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi670: +; NoVLX-NEXT: .Lcfi810: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21685,6 +22519,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21708,16 +22543,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi671: +; NoVLX-NEXT: .Lcfi811: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi672: +; NoVLX-NEXT: .Lcfi812: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi673: +; NoVLX-NEXT: .Lcfi813: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21725,10 +22561,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -21742,6 +22577,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21768,20 +22604,20 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi674: +; NoVLX-NEXT: .Lcfi814: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi675: +; NoVLX-NEXT: .Lcfi815: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi676: +; NoVLX-NEXT: .Lcfi816: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21797,6 +22633,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21817,20 +22654,20 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi677: +; NoVLX-NEXT: .Lcfi817: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi678: +; NoVLX-NEXT: .Lcfi818: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .Lcfi819: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21846,6 +22683,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21868,12 +22706,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .Lcfi820: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .Lcfi821: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .Lcfi822: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -21890,8 +22728,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21907,6 +22745,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21931,12 +22770,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi683: +; NoVLX-NEXT: .Lcfi823: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .Lcfi824: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi685: +; NoVLX-NEXT: .Lcfi825: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -21953,8 +22792,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21970,6 +22809,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -21995,12 +22835,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi686: +; NoVLX-NEXT: .Lcfi826: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi687: +; NoVLX-NEXT: .Lcfi827: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi688: +; NoVLX-NEXT: .Lcfi828: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22008,8 +22848,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -22025,6 +22865,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -22048,12 +22889,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi689: +; NoVLX-NEXT: .Lcfi829: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi690: +; NoVLX-NEXT: .Lcfi830: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi691: +; NoVLX-NEXT: .Lcfi831: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22071,8 +22912,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -22088,6 +22929,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -22133,8 +22975,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22151,6 +22993,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22190,8 +23033,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22208,6 +23051,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22232,6 +23076,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22249,7 +23094,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22267,8 +23111,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22285,6 +23129,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22311,6 +23156,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22328,7 +23174,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22346,8 +23191,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22364,6 +23209,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22409,8 +23255,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22427,6 +23273,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22453,6 +23300,7 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22470,7 +23318,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22488,8 +23335,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -22506,6 +23353,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22550,8 +23398,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22568,6 +23416,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22606,8 +23455,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22624,6 +23473,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22648,6 +23498,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22665,7 +23516,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22682,8 +23532,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22700,6 +23550,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22726,6 +23577,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22743,7 +23595,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22760,8 +23611,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22778,6 +23629,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22822,8 +23674,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22840,6 +23692,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22866,6 +23719,7 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22883,7 +23737,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -22900,8 +23753,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -22918,6 +23771,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22945,12 +23799,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi692: +; NoVLX-NEXT: .Lcfi832: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi693: +; NoVLX-NEXT: .Lcfi833: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi694: +; NoVLX-NEXT: .Lcfi834: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22969,6 +23823,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -22990,12 +23845,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi695: +; NoVLX-NEXT: .Lcfi835: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi696: +; NoVLX-NEXT: .Lcfi836: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi697: +; NoVLX-NEXT: .Lcfi837: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23014,6 +23869,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23037,16 +23893,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi698: +; NoVLX-NEXT: .Lcfi838: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi699: +; NoVLX-NEXT: .Lcfi839: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi700: +; NoVLX-NEXT: .Lcfi840: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -23064,7 +23921,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23079,6 +23935,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23104,16 +23961,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi701: +; NoVLX-NEXT: .Lcfi841: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi702: +; NoVLX-NEXT: .Lcfi842: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .Lcfi843: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -23131,7 +23989,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23146,6 +24003,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23172,12 +24030,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .Lcfi844: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .Lcfi845: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .Lcfi846: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23197,6 +24055,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23221,17 +24080,18 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .Lcfi847: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi708: +; NoVLX-NEXT: .Lcfi848: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi709: +; NoVLX-NEXT: .Lcfi849: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -23249,7 +24109,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23264,6 +24123,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23291,12 +24151,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi710: +; NoVLX-NEXT: .Lcfi850: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .Lcfi851: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .Lcfi852: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23304,8 +24164,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23321,6 +24181,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23342,12 +24203,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .Lcfi853: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .Lcfi854: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .Lcfi855: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23355,8 +24216,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23372,6 +24233,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23395,12 +24257,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi716: +; NoVLX-NEXT: .Lcfi856: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi717: +; NoVLX-NEXT: .Lcfi857: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi718: +; NoVLX-NEXT: .Lcfi858: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23408,6 +24270,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23420,14 +24283,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23443,6 +24305,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23468,12 +24331,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .Lcfi859: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .Lcfi860: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .Lcfi861: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23481,6 +24344,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23493,14 +24357,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23516,6 +24379,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23542,12 +24406,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .Lcfi862: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .Lcfi863: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi724: +; NoVLX-NEXT: .Lcfi864: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23556,8 +24420,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23573,6 +24437,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23597,12 +24462,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi725: +; NoVLX-NEXT: .Lcfi865: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi726: +; NoVLX-NEXT: .Lcfi866: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .Lcfi867: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23611,6 +24476,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23623,14 +24489,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23646,6 +24511,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -23676,6 +24542,7 @@ ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23700,6 +24567,7 @@ ; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23727,6 +24595,7 @@ ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23755,6 +24624,7 @@ ; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23783,6 +24653,7 @@ ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23811,6 +24682,7 @@ ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23837,12 +24709,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .Lcfi868: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .Lcfi869: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .Lcfi870: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23889,6 +24761,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23910,12 +24783,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .Lcfi871: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi732: +; NoVLX-NEXT: .Lcfi872: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi733: +; NoVLX-NEXT: .Lcfi873: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23962,6 +24835,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -23985,12 +24859,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi734: +; NoVLX-NEXT: .Lcfi874: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .Lcfi875: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .Lcfi876: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24038,6 +24912,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24062,12 +24937,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .Lcfi877: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .Lcfi878: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .Lcfi879: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24115,6 +24990,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24140,12 +25016,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi740: +; NoVLX-NEXT: .Lcfi880: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi741: +; NoVLX-NEXT: .Lcfi881: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi742: +; NoVLX-NEXT: .Lcfi882: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24192,6 +25068,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24216,12 +25093,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .Lcfi883: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .Lcfi884: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .Lcfi885: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24269,6 +25146,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24295,53 +25173,53 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .Lcfi886: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .Lcfi887: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi748: +; NoVLX-NEXT: .Lcfi888: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24352,6 +25230,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24373,53 +25252,53 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi749: +; NoVLX-NEXT: .Lcfi889: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi750: +; NoVLX-NEXT: .Lcfi890: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi751: +; NoVLX-NEXT: .Lcfi891: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24430,6 +25309,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24453,54 +25333,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi752: +; NoVLX-NEXT: .Lcfi892: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi753: +; NoVLX-NEXT: .Lcfi893: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi754: +; NoVLX-NEXT: .Lcfi894: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24511,6 +25391,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24535,54 +25416,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi755: +; NoVLX-NEXT: .Lcfi895: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi756: +; NoVLX-NEXT: .Lcfi896: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi757: +; NoVLX-NEXT: .Lcfi897: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24593,6 +25474,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24618,53 +25500,53 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi758: +; NoVLX-NEXT: .Lcfi898: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi759: +; NoVLX-NEXT: .Lcfi899: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi760: +; NoVLX-NEXT: .Lcfi900: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24675,6 +25557,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24699,54 +25582,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi761: +; NoVLX-NEXT: .Lcfi901: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi762: +; NoVLX-NEXT: .Lcfi902: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi763: +; NoVLX-NEXT: .Lcfi903: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24757,6 +25640,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -24782,15 +25666,30 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi764: +; NoVLX-NEXT: .Lcfi904: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi765: +; NoVLX-NEXT: .Lcfi905: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi766: +; NoVLX-NEXT: .Lcfi906: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -24801,64 +25700,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -24867,8 +25766,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -24889,15 +25794,30 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi767: +; NoVLX-NEXT: .Lcfi912: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi768: +; NoVLX-NEXT: .Lcfi913: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi769: +; NoVLX-NEXT: .Lcfi914: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -24909,64 +25829,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -24975,8 +25895,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -24999,15 +25925,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi770: +; NoVLX-NEXT: .Lcfi920: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi771: +; NoVLX-NEXT: .Lcfi921: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi772: +; NoVLX-NEXT: .Lcfi922: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -25019,64 +25960,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -25085,8 +26026,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25110,15 +26057,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi773: +; NoVLX-NEXT: .Lcfi928: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi774: +; NoVLX-NEXT: .Lcfi929: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi775: +; NoVLX-NEXT: .Lcfi930: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25131,64 +26093,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -25197,8 +26159,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25223,12 +26191,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi776: +; NoVLX-NEXT: .Lcfi936: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi777: +; NoVLX-NEXT: .Lcfi937: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi778: +; NoVLX-NEXT: .Lcfi938: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25237,15 +26205,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi779: +; NoVLX-NEXT: .Lcfi939: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi780: +; NoVLX-NEXT: .Lcfi940: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi781: +; NoVLX-NEXT: .Lcfi941: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi782: +; NoVLX-NEXT: .Lcfi942: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi783: +; NoVLX-NEXT: .Lcfi943: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25253,6 +26221,10 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25295,11 +26267,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25311,15 +26283,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25335,6 +26303,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25355,12 +26324,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi784: +; NoVLX-NEXT: .Lcfi944: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi785: +; NoVLX-NEXT: .Lcfi945: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi786: +; NoVLX-NEXT: .Lcfi946: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25369,15 +26338,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi787: +; NoVLX-NEXT: .Lcfi947: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi788: +; NoVLX-NEXT: .Lcfi948: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi789: +; NoVLX-NEXT: .Lcfi949: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi790: +; NoVLX-NEXT: .Lcfi950: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi791: +; NoVLX-NEXT: .Lcfi951: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25386,6 +26355,10 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25428,11 +26401,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25444,15 +26417,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25468,6 +26437,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25490,12 +26460,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi792: +; NoVLX-NEXT: .Lcfi952: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi793: +; NoVLX-NEXT: .Lcfi953: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi794: +; NoVLX-NEXT: .Lcfi954: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25504,15 +26474,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi795: +; NoVLX-NEXT: .Lcfi955: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi796: +; NoVLX-NEXT: .Lcfi956: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi797: +; NoVLX-NEXT: .Lcfi957: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi798: +; NoVLX-NEXT: .Lcfi958: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi799: +; NoVLX-NEXT: .Lcfi959: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25521,6 +26491,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25563,11 +26537,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25579,15 +26553,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25603,6 +26573,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25626,12 +26597,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi800: +; NoVLX-NEXT: .Lcfi960: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi801: +; NoVLX-NEXT: .Lcfi961: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi802: +; NoVLX-NEXT: .Lcfi962: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25640,15 +26611,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi803: +; NoVLX-NEXT: .Lcfi963: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi804: +; NoVLX-NEXT: .Lcfi964: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi805: +; NoVLX-NEXT: .Lcfi965: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi806: +; NoVLX-NEXT: .Lcfi966: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi807: +; NoVLX-NEXT: .Lcfi967: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25658,6 +26629,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25700,11 +26675,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25716,15 +26691,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25740,6 +26711,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -25765,12 +26737,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi808: +; NoVLX-NEXT: .Lcfi968: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi809: +; NoVLX-NEXT: .Lcfi969: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi810: +; NoVLX-NEXT: .Lcfi970: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25795,6 +26767,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -25816,12 +26789,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi811: +; NoVLX-NEXT: .Lcfi971: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi812: +; NoVLX-NEXT: .Lcfi972: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi813: +; NoVLX-NEXT: .Lcfi973: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25847,6 +26820,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -25870,12 +26844,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi814: +; NoVLX-NEXT: .Lcfi974: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi815: +; NoVLX-NEXT: .Lcfi975: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi816: +; NoVLX-NEXT: .Lcfi976: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -25909,6 +26883,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -25933,12 +26908,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi817: +; NoVLX-NEXT: .Lcfi977: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi818: +; NoVLX-NEXT: .Lcfi978: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi819: +; NoVLX-NEXT: .Lcfi979: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -25973,6 +26948,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -26007,6 +26983,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26038,6 +27015,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26071,6 +27049,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26106,6 +27085,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26130,12 +27110,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi820: +; NoVLX-NEXT: .Lcfi980: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi821: +; NoVLX-NEXT: .Lcfi981: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi822: +; NoVLX-NEXT: .Lcfi982: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26187,6 +27167,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26207,12 +27188,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi823: +; NoVLX-NEXT: .Lcfi983: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi824: +; NoVLX-NEXT: .Lcfi984: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi825: +; NoVLX-NEXT: .Lcfi985: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26265,6 +27246,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26287,12 +27269,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi826: +; NoVLX-NEXT: .Lcfi986: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi827: +; NoVLX-NEXT: .Lcfi987: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi828: +; NoVLX-NEXT: .Lcfi988: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26345,6 +27327,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26368,12 +27351,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi829: +; NoVLX-NEXT: .Lcfi989: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi830: +; NoVLX-NEXT: .Lcfi990: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi831: +; NoVLX-NEXT: .Lcfi991: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26427,6 +27410,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26451,12 +27435,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi832: +; NoVLX-NEXT: .Lcfi992: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi833: +; NoVLX-NEXT: .Lcfi993: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi834: +; NoVLX-NEXT: .Lcfi994: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26466,43 +27450,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26513,6 +27497,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26533,12 +27518,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi835: +; NoVLX-NEXT: .Lcfi995: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi836: +; NoVLX-NEXT: .Lcfi996: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi837: +; NoVLX-NEXT: .Lcfi997: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26549,43 +27534,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26596,6 +27581,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26618,12 +27604,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi838: +; NoVLX-NEXT: .Lcfi998: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi839: +; NoVLX-NEXT: .Lcfi999: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi840: +; NoVLX-NEXT: .Lcfi1000: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26634,43 +27620,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26681,6 +27667,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26704,12 +27691,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi841: +; NoVLX-NEXT: .Lcfi1001: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi842: +; NoVLX-NEXT: .Lcfi1002: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi843: +; NoVLX-NEXT: .Lcfi1003: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26721,43 +27708,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26768,6 +27755,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -26793,15 +27781,30 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi844: +; NoVLX-NEXT: .Lcfi1004: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi845: +; NoVLX-NEXT: .Lcfi1005: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi846: +; NoVLX-NEXT: .Lcfi1006: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -26812,64 +27815,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -26878,8 +27881,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -26901,15 +27910,30 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi847: +; NoVLX-NEXT: .Lcfi1012: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi848: +; NoVLX-NEXT: .Lcfi1013: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi849: +; NoVLX-NEXT: .Lcfi1014: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -26921,64 +27945,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -26987,8 +28011,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27012,15 +28042,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi850: +; NoVLX-NEXT: .Lcfi1020: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi851: +; NoVLX-NEXT: .Lcfi1021: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi852: +; NoVLX-NEXT: .Lcfi1022: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -27032,64 +28077,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -27098,8 +28143,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27124,15 +28175,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi853: +; NoVLX-NEXT: .Lcfi1028: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi854: +; NoVLX-NEXT: .Lcfi1029: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi855: +; NoVLX-NEXT: .Lcfi1030: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27145,64 +28211,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -27211,8 +28277,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27238,12 +28310,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi856: +; NoVLX-NEXT: .Lcfi1036: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi857: +; NoVLX-NEXT: .Lcfi1037: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi858: +; NoVLX-NEXT: .Lcfi1038: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27252,15 +28324,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi859: +; NoVLX-NEXT: .Lcfi1039: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi860: +; NoVLX-NEXT: .Lcfi1040: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi861: +; NoVLX-NEXT: .Lcfi1041: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi862: +; NoVLX-NEXT: .Lcfi1042: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi863: +; NoVLX-NEXT: .Lcfi1043: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27268,6 +28340,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27310,11 +28386,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27326,15 +28402,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27350,6 +28422,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27371,12 +28444,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi864: +; NoVLX-NEXT: .Lcfi1044: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi865: +; NoVLX-NEXT: .Lcfi1045: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi866: +; NoVLX-NEXT: .Lcfi1046: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27385,15 +28458,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi867: +; NoVLX-NEXT: .Lcfi1047: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi868: +; NoVLX-NEXT: .Lcfi1048: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi869: +; NoVLX-NEXT: .Lcfi1049: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi870: +; NoVLX-NEXT: .Lcfi1050: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi871: +; NoVLX-NEXT: .Lcfi1051: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27402,6 +28475,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27444,11 +28521,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27460,15 +28537,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27484,6 +28557,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27507,12 +28581,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi872: +; NoVLX-NEXT: .Lcfi1052: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi873: +; NoVLX-NEXT: .Lcfi1053: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi874: +; NoVLX-NEXT: .Lcfi1054: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27521,15 +28595,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi875: +; NoVLX-NEXT: .Lcfi1055: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi876: +; NoVLX-NEXT: .Lcfi1056: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi877: +; NoVLX-NEXT: .Lcfi1057: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi878: +; NoVLX-NEXT: .Lcfi1058: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi879: +; NoVLX-NEXT: .Lcfi1059: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27538,6 +28612,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27580,11 +28658,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27596,15 +28674,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27620,6 +28694,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27644,12 +28719,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi880: +; NoVLX-NEXT: .Lcfi1060: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi881: +; NoVLX-NEXT: .Lcfi1061: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi882: +; NoVLX-NEXT: .Lcfi1062: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27658,15 +28733,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi883: +; NoVLX-NEXT: .Lcfi1063: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi884: +; NoVLX-NEXT: .Lcfi1064: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi885: +; NoVLX-NEXT: .Lcfi1065: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi886: +; NoVLX-NEXT: .Lcfi1066: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi887: +; NoVLX-NEXT: .Lcfi1067: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27676,6 +28751,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27718,11 +28797,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27734,15 +28813,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27758,6 +28833,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -27783,58 +28859,62 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi888: +; NoVLX-NEXT: .Lcfi1068: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi889: +; NoVLX-NEXT: .Lcfi1069: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi890: +; NoVLX-NEXT: .Lcfi1070: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -27842,82 +28922,79 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -27925,7 +29002,7 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx @@ -27933,35 +29010,34 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm0 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm4 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -28116,6 +29192,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -28137,68 +29214,69 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi891: +; NoVLX-NEXT: .Lcfi1071: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi892: +; NoVLX-NEXT: .Lcfi1072: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi893: +; NoVLX-NEXT: .Lcfi1073: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -28206,8 +29284,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -28217,24 +29294,24 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; NoVLX-NEXT: vmovdqa (%rdi), %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 @@ -28387,6 +29464,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -28410,12 +29488,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi894: +; NoVLX-NEXT: .Lcfi1074: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi895: +; NoVLX-NEXT: .Lcfi1075: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi896: +; NoVLX-NEXT: .Lcfi1076: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -28426,12 +29504,17 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -28439,61 +29522,59 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -28501,8 +29582,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax @@ -28512,30 +29592,29 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -28543,18 +29622,47 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx @@ -28566,31 +29674,7 @@ ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm1, %rax -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm0 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm2, %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 ; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 @@ -28657,83 +29741,77 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -28752,6 +29830,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -28776,12 +29855,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi897: +; NoVLX-NEXT: .Lcfi1077: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi898: +; NoVLX-NEXT: .Lcfi1078: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi899: +; NoVLX-NEXT: .Lcfi1079: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -28793,6 +29872,8 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -28805,20 +29886,19 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -28826,7 +29906,6 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -28868,160 +29947,160 @@ ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; NoVLX-NEXT: vmovdqa (%rsi), %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm5 +; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm4 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 -; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -29035,6 +30114,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -29078,8 +30158,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29096,6 +30176,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29136,8 +30217,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29154,6 +30235,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29176,6 +30258,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29187,14 +30270,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29211,8 +30293,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29229,6 +30311,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29254,6 +30337,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29265,14 +30349,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29289,8 +30372,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29307,6 +30390,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29353,8 +30437,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29371,6 +30455,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29396,6 +30481,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29407,14 +30493,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29431,8 +30516,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -29449,6 +30534,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29493,8 +30579,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29511,6 +30597,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29550,8 +30637,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29568,6 +30655,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29590,6 +30678,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29601,14 +30690,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29624,8 +30712,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29642,6 +30730,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29667,6 +30756,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29678,14 +30768,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29701,8 +30790,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29719,6 +30808,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29764,8 +30854,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29782,6 +30872,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29807,6 +30898,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29818,14 +30910,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -29841,8 +30932,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -29859,6 +30950,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29885,12 +30977,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi900: +; NoVLX-NEXT: .Lcfi1080: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi901: +; NoVLX-NEXT: .Lcfi1081: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi902: +; NoVLX-NEXT: .Lcfi1082: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -29910,6 +31002,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29930,12 +31023,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi903: +; NoVLX-NEXT: .Lcfi1083: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi904: +; NoVLX-NEXT: .Lcfi1084: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi905: +; NoVLX-NEXT: .Lcfi1085: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -29956,6 +31049,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -29978,15 +31072,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi906: +; NoVLX-NEXT: .Lcfi1086: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .Lcfi1087: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .Lcfi1088: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -29998,14 +31093,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30019,6 +31113,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30043,16 +31138,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .Lcfi1089: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .Lcfi1090: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .Lcfi1091: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -30064,14 +31160,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30085,6 +31180,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30111,12 +31207,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi912: +; NoVLX-NEXT: .Lcfi1092: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi913: +; NoVLX-NEXT: .Lcfi1093: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi914: +; NoVLX-NEXT: .Lcfi1094: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30137,6 +31233,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30161,16 +31258,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .Lcfi1095: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .Lcfi1096: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .Lcfi1097: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -30182,14 +31280,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30203,6 +31300,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30229,12 +31327,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .Lcfi1098: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .Lcfi1099: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi920: +; NoVLX-NEXT: .Lcfi1100: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30243,8 +31341,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30260,6 +31358,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30280,12 +31379,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi921: +; NoVLX-NEXT: .Lcfi1101: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi922: +; NoVLX-NEXT: .Lcfi1102: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .Lcfi1103: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30295,8 +31394,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30312,6 +31411,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30334,18 +31434,19 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .Lcfi1104: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .Lcfi1105: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .Lcfi1106: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30358,14 +31459,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30381,6 +31481,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30405,12 +31506,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .Lcfi1107: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi928: +; NoVLX-NEXT: .Lcfi1108: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi929: +; NoVLX-NEXT: .Lcfi1109: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30418,6 +31519,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30430,14 +31532,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30453,6 +31554,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30479,12 +31581,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi930: +; NoVLX-NEXT: .Lcfi1110: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .Lcfi1111: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .Lcfi1112: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30494,8 +31596,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30511,6 +31613,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30535,12 +31638,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .Lcfi1113: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .Lcfi1114: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .Lcfi1115: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30548,6 +31651,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30560,14 +31664,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30583,6 +31686,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -30617,6 +31721,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30645,6 +31750,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30676,6 +31782,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30708,6 +31815,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30741,6 +31849,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30774,6 +31883,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30800,12 +31910,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi936: +; NoVLX-NEXT: .Lcfi1116: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi937: +; NoVLX-NEXT: .Lcfi1117: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi938: +; NoVLX-NEXT: .Lcfi1118: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30854,6 +31964,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30875,12 +31986,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi939: +; NoVLX-NEXT: .Lcfi1119: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi940: +; NoVLX-NEXT: .Lcfi1120: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi941: +; NoVLX-NEXT: .Lcfi1121: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30929,6 +32040,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -30952,12 +32064,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi942: +; NoVLX-NEXT: .Lcfi1122: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi943: +; NoVLX-NEXT: .Lcfi1123: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi944: +; NoVLX-NEXT: .Lcfi1124: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31008,6 +32120,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31032,12 +32145,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi945: +; NoVLX-NEXT: .Lcfi1125: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi946: +; NoVLX-NEXT: .Lcfi1126: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi947: +; NoVLX-NEXT: .Lcfi1127: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31088,6 +32201,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31114,12 +32228,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi948: +; NoVLX-NEXT: .Lcfi1128: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi949: +; NoVLX-NEXT: .Lcfi1129: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi950: +; NoVLX-NEXT: .Lcfi1130: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31168,6 +32282,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31193,12 +32308,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi951: +; NoVLX-NEXT: .Lcfi1131: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi952: +; NoVLX-NEXT: .Lcfi1132: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi953: +; NoVLX-NEXT: .Lcfi1133: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31249,6 +32364,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31275,55 +32391,55 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi954: +; NoVLX-NEXT: .Lcfi1134: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi955: +; NoVLX-NEXT: .Lcfi1135: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi956: +; NoVLX-NEXT: .Lcfi1136: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31334,6 +32450,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31355,55 +32472,55 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi957: +; NoVLX-NEXT: .Lcfi1137: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi958: +; NoVLX-NEXT: .Lcfi1138: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi959: +; NoVLX-NEXT: .Lcfi1139: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31414,6 +32531,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31437,12 +32555,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi960: +; NoVLX-NEXT: .Lcfi1140: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi961: +; NoVLX-NEXT: .Lcfi1141: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi962: +; NoVLX-NEXT: .Lcfi1142: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31451,43 +32569,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31498,6 +32616,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31522,12 +32641,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi963: +; NoVLX-NEXT: .Lcfi1143: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi964: +; NoVLX-NEXT: .Lcfi1144: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi965: +; NoVLX-NEXT: .Lcfi1145: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31536,43 +32655,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31583,6 +32702,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31609,55 +32729,55 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi966: +; NoVLX-NEXT: .Lcfi1146: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi967: +; NoVLX-NEXT: .Lcfi1147: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi968: +; NoVLX-NEXT: .Lcfi1148: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31668,6 +32788,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31693,12 +32814,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi969: +; NoVLX-NEXT: .Lcfi1149: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi970: +; NoVLX-NEXT: .Lcfi1150: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi971: +; NoVLX-NEXT: .Lcfi1151: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31707,43 +32828,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31754,6 +32875,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -31780,78 +32902,93 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi972: +; NoVLX-NEXT: .Lcfi1152: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi973: +; NoVLX-NEXT: .Lcfi1153: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi974: +; NoVLX-NEXT: .Lcfi1154: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -31860,8 +32997,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -31883,78 +33026,93 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi975: +; NoVLX-NEXT: .Lcfi1160: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi976: +; NoVLX-NEXT: .Lcfi1161: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi977: +; NoVLX-NEXT: .Lcfi1162: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -31963,8 +33121,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -31988,79 +33152,94 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi978: +; NoVLX-NEXT: .Lcfi1168: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi979: +; NoVLX-NEXT: .Lcfi1169: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi980: +; NoVLX-NEXT: .Lcfi1170: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -32069,8 +33248,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32095,79 +33280,94 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi981: +; NoVLX-NEXT: .Lcfi1176: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi982: +; NoVLX-NEXT: .Lcfi1177: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi983: +; NoVLX-NEXT: .Lcfi1178: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -32176,8 +33376,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32204,79 +33410,94 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi984: +; NoVLX-NEXT: .Lcfi1184: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi985: +; NoVLX-NEXT: .Lcfi1185: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi986: +; NoVLX-NEXT: .Lcfi1186: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -32285,8 +33506,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32312,15 +33539,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi987: +; NoVLX-NEXT: .Lcfi1192: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi988: +; NoVLX-NEXT: .Lcfi1193: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi989: +; NoVLX-NEXT: .Lcfi1194: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} @@ -32328,64 +33570,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -32394,8 +33636,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32422,12 +33670,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi990: +; NoVLX-NEXT: .Lcfi1200: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi991: +; NoVLX-NEXT: .Lcfi1201: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi992: +; NoVLX-NEXT: .Lcfi1202: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32436,17 +33684,150 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi993: +; NoVLX-NEXT: .Lcfi1203: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi994: +; NoVLX-NEXT: .Lcfi1204: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi995: +; NoVLX-NEXT: .Lcfi1205: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi996: +; NoVLX-NEXT: .Lcfi1206: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi997: +; NoVLX-NEXT: .Lcfi1207: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -32489,139 +33870,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__a to <16 x i32> - %1 = bitcast <8 x i64> %__b to <16 x i32> - %2 = icmp sge <16 x i32> %0, %1 - %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 -} - -define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi998: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi999: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1000: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1001: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1002: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1003: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1004: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1005: -; NoVLX-NEXT: .cfi_offset %r15, -24 -; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r10d -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d -; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi -; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32633,15 +33886,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32657,6 +33906,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32680,12 +33930,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1006: +; NoVLX-NEXT: .Lcfi1216: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .Lcfi1217: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .Lcfi1218: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32694,18 +33944,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .Lcfi1219: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .Lcfi1220: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .Lcfi1221: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1012: +; NoVLX-NEXT: .Lcfi1222: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1013: +; NoVLX-NEXT: .Lcfi1223: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -32748,11 +34002,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32764,15 +34018,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32788,6 +34038,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32812,12 +34063,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1014: +; NoVLX-NEXT: .Lcfi1224: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .Lcfi1225: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .Lcfi1226: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32826,18 +34077,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .Lcfi1227: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .Lcfi1228: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .Lcfi1229: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1020: +; NoVLX-NEXT: .Lcfi1230: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1021: +; NoVLX-NEXT: .Lcfi1231: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -32880,11 +34135,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32896,15 +34151,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32920,6 +34171,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -32946,12 +34198,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1022: +; NoVLX-NEXT: .Lcfi1232: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .Lcfi1233: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .Lcfi1234: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32960,18 +34212,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .Lcfi1235: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .Lcfi1236: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .Lcfi1237: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1028: +; NoVLX-NEXT: .Lcfi1238: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1029: +; NoVLX-NEXT: .Lcfi1239: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -33014,11 +34270,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -33030,15 +34286,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -33054,6 +34306,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -33079,12 +34332,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1030: +; NoVLX-NEXT: .Lcfi1240: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .Lcfi1241: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .Lcfi1242: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -33093,19 +34346,23 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .Lcfi1243: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .Lcfi1244: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .Lcfi1245: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1036: +; NoVLX-NEXT: .Lcfi1246: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1037: +; NoVLX-NEXT: .Lcfi1247: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -33148,11 +34405,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -33164,15 +34421,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -33188,6 +34441,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -33222,6 +34476,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33252,6 +34507,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33274,6 +34530,7 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33281,16 +34538,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33316,6 +34573,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33323,16 +34581,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33369,6 +34627,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33394,6 +34653,7 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33401,16 +34661,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33458,6 +34718,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33500,6 +34761,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33551,6 +34813,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33605,6 +34868,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33653,6 +34917,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33707,6 +34972,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33753,6 +35019,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33794,6 +35061,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33844,6 +35112,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33897,6 +35166,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33944,6 +35214,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -33997,6 +35268,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34023,12 +35295,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1038: +; NoVLX-NEXT: .Lcfi1248: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1039: +; NoVLX-NEXT: .Lcfi1249: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1040: +; NoVLX-NEXT: .Lcfi1250: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34048,6 +35320,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34068,12 +35341,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1041: +; NoVLX-NEXT: .Lcfi1251: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1042: +; NoVLX-NEXT: .Lcfi1252: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1043: +; NoVLX-NEXT: .Lcfi1253: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34094,6 +35367,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34116,15 +35390,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1044: +; NoVLX-NEXT: .Lcfi1254: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1045: +; NoVLX-NEXT: .Lcfi1255: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1046: +; NoVLX-NEXT: .Lcfi1256: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34132,10 +35407,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34149,6 +35423,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34173,16 +35448,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1047: +; NoVLX-NEXT: .Lcfi1257: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1048: +; NoVLX-NEXT: .Lcfi1258: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1049: +; NoVLX-NEXT: .Lcfi1259: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34190,10 +35466,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34207,6 +35482,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34233,12 +35509,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1050: +; NoVLX-NEXT: .Lcfi1260: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1051: +; NoVLX-NEXT: .Lcfi1261: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1052: +; NoVLX-NEXT: .Lcfi1262: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34259,6 +35535,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34283,16 +35560,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1053: +; NoVLX-NEXT: .Lcfi1263: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1054: +; NoVLX-NEXT: .Lcfi1264: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1055: +; NoVLX-NEXT: .Lcfi1265: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34300,10 +35578,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34317,6 +35594,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34343,12 +35621,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1056: +; NoVLX-NEXT: .Lcfi1266: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1057: +; NoVLX-NEXT: .Lcfi1267: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1058: +; NoVLX-NEXT: .Lcfi1268: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34357,8 +35635,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34374,6 +35652,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34394,12 +35673,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1059: +; NoVLX-NEXT: .Lcfi1269: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1060: +; NoVLX-NEXT: .Lcfi1270: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1061: +; NoVLX-NEXT: .Lcfi1271: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34409,8 +35688,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34426,6 +35705,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34448,12 +35728,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1062: +; NoVLX-NEXT: .Lcfi1272: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1063: +; NoVLX-NEXT: .Lcfi1273: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1064: +; NoVLX-NEXT: .Lcfi1274: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34470,8 +35750,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34487,6 +35767,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34511,12 +35792,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1065: +; NoVLX-NEXT: .Lcfi1275: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1066: +; NoVLX-NEXT: .Lcfi1276: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1067: +; NoVLX-NEXT: .Lcfi1277: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34534,8 +35815,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34551,6 +35832,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34577,12 +35859,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1068: +; NoVLX-NEXT: .Lcfi1278: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1069: +; NoVLX-NEXT: .Lcfi1279: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1070: +; NoVLX-NEXT: .Lcfi1280: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34592,8 +35874,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34609,6 +35891,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34633,12 +35916,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1071: +; NoVLX-NEXT: .Lcfi1281: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1072: +; NoVLX-NEXT: .Lcfi1282: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1073: +; NoVLX-NEXT: .Lcfi1283: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34656,8 +35939,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34673,6 +35956,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -34720,8 +36004,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -34738,6 +36022,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -34780,8 +36065,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -34798,6 +36083,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -34824,6 +36110,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34841,7 +36128,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -34859,8 +36145,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -34877,6 +36163,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -34906,6 +36193,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34923,7 +36211,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -34941,8 +36228,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -34959,6 +36246,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35007,8 +36295,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -35025,6 +36313,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35054,6 +36343,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35071,7 +36361,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -35089,8 +36378,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -35107,6 +36396,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35153,8 +36443,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35171,6 +36461,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35212,8 +36503,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35230,6 +36521,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35256,6 +36548,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35273,7 +36566,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -35290,8 +36582,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35308,6 +36600,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35337,6 +36630,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35354,7 +36648,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -35371,8 +36664,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35389,6 +36682,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35436,8 +36730,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35454,6 +36748,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35483,6 +36778,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35500,7 +36796,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -35517,8 +36812,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -35535,6 +36830,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35562,12 +36858,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1074: +; NoVLX-NEXT: .Lcfi1284: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1075: +; NoVLX-NEXT: .Lcfi1285: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1076: +; NoVLX-NEXT: .Lcfi1286: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35588,6 +36884,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35609,12 +36906,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1077: +; NoVLX-NEXT: .Lcfi1287: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1078: +; NoVLX-NEXT: .Lcfi1288: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1079: +; NoVLX-NEXT: .Lcfi1289: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35636,6 +36933,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35659,18 +36957,19 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1080: +; NoVLX-NEXT: .Lcfi1290: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1081: +; NoVLX-NEXT: .Lcfi1291: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1082: +; NoVLX-NEXT: .Lcfi1292: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35688,7 +36987,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35703,6 +37001,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35728,12 +37027,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1083: +; NoVLX-NEXT: .Lcfi1293: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1084: +; NoVLX-NEXT: .Lcfi1294: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1085: +; NoVLX-NEXT: .Lcfi1295: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35741,6 +37040,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35758,7 +37058,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35773,6 +37072,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35800,12 +37100,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1086: +; NoVLX-NEXT: .Lcfi1296: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1087: +; NoVLX-NEXT: .Lcfi1297: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1088: +; NoVLX-NEXT: .Lcfi1298: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35827,6 +37127,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35852,12 +37153,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1089: +; NoVLX-NEXT: .Lcfi1299: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1090: +; NoVLX-NEXT: .Lcfi1300: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1091: +; NoVLX-NEXT: .Lcfi1301: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35865,6 +37166,7 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35882,7 +37184,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35897,6 +37198,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35924,12 +37226,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1092: +; NoVLX-NEXT: .Lcfi1302: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1093: +; NoVLX-NEXT: .Lcfi1303: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1094: +; NoVLX-NEXT: .Lcfi1304: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35939,8 +37241,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -35956,6 +37258,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -35977,12 +37280,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1095: +; NoVLX-NEXT: .Lcfi1305: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1096: +; NoVLX-NEXT: .Lcfi1306: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1097: +; NoVLX-NEXT: .Lcfi1307: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35993,8 +37296,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36010,6 +37313,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -36033,12 +37337,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1098: +; NoVLX-NEXT: .Lcfi1308: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1099: +; NoVLX-NEXT: .Lcfi1309: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1100: +; NoVLX-NEXT: .Lcfi1310: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36048,6 +37352,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36060,14 +37365,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36083,6 +37387,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -36108,12 +37413,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1101: +; NoVLX-NEXT: .Lcfi1311: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1102: +; NoVLX-NEXT: .Lcfi1312: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1103: +; NoVLX-NEXT: .Lcfi1313: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36124,6 +37429,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36136,14 +37442,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36159,6 +37464,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -36186,12 +37492,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1104: +; NoVLX-NEXT: .Lcfi1314: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1105: +; NoVLX-NEXT: .Lcfi1315: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1106: +; NoVLX-NEXT: .Lcfi1316: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36202,8 +37508,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36219,6 +37525,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -36244,12 +37551,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1107: +; NoVLX-NEXT: .Lcfi1317: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1108: +; NoVLX-NEXT: .Lcfi1318: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1109: +; NoVLX-NEXT: .Lcfi1319: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36260,6 +37567,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36272,14 +37580,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36295,6 +37602,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -36325,6 +37633,7 @@ ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36349,6 +37658,7 @@ ; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36376,6 +37686,7 @@ ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36404,6 +37715,7 @@ ; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36434,6 +37746,7 @@ ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36464,6 +37777,7 @@ ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36490,12 +37804,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1110: +; NoVLX-NEXT: .Lcfi1320: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1111: +; NoVLX-NEXT: .Lcfi1321: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1112: +; NoVLX-NEXT: .Lcfi1322: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36542,6 +37856,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36563,12 +37878,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1113: +; NoVLX-NEXT: .Lcfi1323: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1114: +; NoVLX-NEXT: .Lcfi1324: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1115: +; NoVLX-NEXT: .Lcfi1325: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36615,6 +37930,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36638,12 +37954,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1116: +; NoVLX-NEXT: .Lcfi1326: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1117: +; NoVLX-NEXT: .Lcfi1327: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1118: +; NoVLX-NEXT: .Lcfi1328: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36691,6 +38007,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36715,12 +38032,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1119: +; NoVLX-NEXT: .Lcfi1329: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1120: +; NoVLX-NEXT: .Lcfi1330: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1121: +; NoVLX-NEXT: .Lcfi1331: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36768,6 +38085,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36794,12 +38112,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1122: +; NoVLX-NEXT: .Lcfi1332: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1123: +; NoVLX-NEXT: .Lcfi1333: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1124: +; NoVLX-NEXT: .Lcfi1334: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36847,6 +38165,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36872,12 +38191,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1125: +; NoVLX-NEXT: .Lcfi1335: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1126: +; NoVLX-NEXT: .Lcfi1336: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1127: +; NoVLX-NEXT: .Lcfi1337: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36926,6 +38245,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -36952,53 +38272,53 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1128: +; NoVLX-NEXT: .Lcfi1338: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1129: +; NoVLX-NEXT: .Lcfi1339: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1130: +; NoVLX-NEXT: .Lcfi1340: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37009,6 +38329,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37030,53 +38351,53 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .Lcfi1341: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .Lcfi1342: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .Lcfi1343: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37087,6 +38408,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37110,54 +38432,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .Lcfi1344: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .Lcfi1345: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .Lcfi1346: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37168,6 +38490,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37192,54 +38515,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1137: +; NoVLX-NEXT: .Lcfi1347: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1138: +; NoVLX-NEXT: .Lcfi1348: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1139: +; NoVLX-NEXT: .Lcfi1349: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37250,6 +38573,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37276,54 +38600,54 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1140: +; NoVLX-NEXT: .Lcfi1350: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1141: +; NoVLX-NEXT: .Lcfi1351: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1142: +; NoVLX-NEXT: .Lcfi1352: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37334,6 +38658,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37359,55 +38684,55 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1143: +; NoVLX-NEXT: .Lcfi1353: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1144: +; NoVLX-NEXT: .Lcfi1354: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1145: +; NoVLX-NEXT: .Lcfi1355: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37418,6 +38743,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -37443,15 +38769,30 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1146: +; NoVLX-NEXT: .Lcfi1356: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1147: +; NoVLX-NEXT: .Lcfi1357: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1148: +; NoVLX-NEXT: .Lcfi1358: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -37463,64 +38804,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -37529,8 +38870,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -37551,15 +38898,30 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1149: +; NoVLX-NEXT: .Lcfi1364: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1150: +; NoVLX-NEXT: .Lcfi1365: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1151: +; NoVLX-NEXT: .Lcfi1366: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 @@ -37571,64 +38933,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -37637,8 +38999,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -37661,15 +39029,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1152: +; NoVLX-NEXT: .Lcfi1372: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1153: +; NoVLX-NEXT: .Lcfi1373: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1154: +; NoVLX-NEXT: .Lcfi1374: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -37682,64 +39065,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -37748,8 +39131,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -37773,15 +39162,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .Lcfi1380: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .Lcfi1381: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .Lcfi1382: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 @@ -37794,64 +39198,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -37860,8 +39264,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -37886,12 +39296,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .Lcfi1388: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .Lcfi1389: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1160: +; NoVLX-NEXT: .Lcfi1390: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -37900,15 +39310,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1161: +; NoVLX-NEXT: .Lcfi1391: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1162: +; NoVLX-NEXT: .Lcfi1392: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .Lcfi1393: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .Lcfi1394: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .Lcfi1395: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -37917,6 +39327,10 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -37959,11 +39373,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -37975,15 +39389,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37999,6 +39409,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -38019,12 +39430,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .Lcfi1396: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .Lcfi1397: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1168: +; NoVLX-NEXT: .Lcfi1398: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38033,23 +39444,27 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1169: +; NoVLX-NEXT: .Lcfi1399: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1170: +; NoVLX-NEXT: .Lcfi1400: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .Lcfi1401: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .Lcfi1402: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .Lcfi1403: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38092,11 +39507,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38108,15 +39523,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38132,6 +39543,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -38154,12 +39566,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .Lcfi1404: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .Lcfi1405: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1176: +; NoVLX-NEXT: .Lcfi1406: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38168,15 +39580,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1177: +; NoVLX-NEXT: .Lcfi1407: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1178: +; NoVLX-NEXT: .Lcfi1408: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .Lcfi1409: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .Lcfi1410: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .Lcfi1411: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -38186,6 +39598,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38228,11 +39644,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38244,15 +39660,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38268,6 +39680,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -38291,12 +39704,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .Lcfi1412: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .Lcfi1413: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1184: +; NoVLX-NEXT: .Lcfi1414: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38305,24 +39718,28 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1185: +; NoVLX-NEXT: .Lcfi1415: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1186: +; NoVLX-NEXT: .Lcfi1416: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .Lcfi1417: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .Lcfi1418: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .Lcfi1419: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38365,11 +39782,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38381,15 +39798,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38405,6 +39818,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> @@ -38430,12 +39844,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .Lcfi1420: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .Lcfi1421: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1192: +; NoVLX-NEXT: .Lcfi1422: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38461,6 +39875,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -38482,19 +39897,19 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1193: +; NoVLX-NEXT: .Lcfi1423: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1194: +; NoVLX-NEXT: .Lcfi1424: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .Lcfi1425: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -38513,6 +39928,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -38536,12 +39952,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .Lcfi1426: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .Lcfi1427: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .Lcfi1428: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -38576,6 +39992,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -38600,12 +40017,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .Lcfi1429: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1200: +; NoVLX-NEXT: .Lcfi1430: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1201: +; NoVLX-NEXT: .Lcfi1431: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -38640,6 +40057,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> @@ -38675,6 +40093,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38706,6 +40125,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38740,6 +40160,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38775,6 +40196,7 @@ ; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38799,12 +40221,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1202: +; NoVLX-NEXT: .Lcfi1432: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1203: +; NoVLX-NEXT: .Lcfi1433: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1204: +; NoVLX-NEXT: .Lcfi1434: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -38857,6 +40279,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38877,12 +40300,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1205: +; NoVLX-NEXT: .Lcfi1435: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1206: +; NoVLX-NEXT: .Lcfi1436: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1207: +; NoVLX-NEXT: .Lcfi1437: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -38935,6 +40358,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -38957,12 +40381,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .Lcfi1438: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .Lcfi1439: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .Lcfi1440: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39016,6 +40440,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39039,12 +40464,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .Lcfi1441: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .Lcfi1442: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .Lcfi1443: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39098,6 +40523,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39122,12 +40548,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .Lcfi1444: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .Lcfi1445: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1216: +; NoVLX-NEXT: .Lcfi1446: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39138,43 +40564,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39185,6 +40611,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39205,59 +40632,59 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1217: +; NoVLX-NEXT: .Lcfi1447: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1218: +; NoVLX-NEXT: .Lcfi1448: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1219: +; NoVLX-NEXT: .Lcfi1449: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39268,6 +40695,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39290,12 +40718,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1220: +; NoVLX-NEXT: .Lcfi1450: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1221: +; NoVLX-NEXT: .Lcfi1451: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1222: +; NoVLX-NEXT: .Lcfi1452: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39307,43 +40735,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39354,6 +40782,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39377,60 +40806,60 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1223: +; NoVLX-NEXT: .Lcfi1453: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1224: +; NoVLX-NEXT: .Lcfi1454: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1225: +; NoVLX-NEXT: .Lcfi1455: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39441,6 +40870,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> @@ -39466,15 +40896,30 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1226: +; NoVLX-NEXT: .Lcfi1456: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1227: +; NoVLX-NEXT: .Lcfi1457: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1228: +; NoVLX-NEXT: .Lcfi1458: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -39486,64 +40931,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39552,8 +40997,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -39575,15 +41026,30 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1229: +; NoVLX-NEXT: .Lcfi1464: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1230: +; NoVLX-NEXT: .Lcfi1465: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1231: +; NoVLX-NEXT: .Lcfi1466: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 @@ -39595,64 +41061,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39661,8 +41127,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -39686,15 +41158,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1232: +; NoVLX-NEXT: .Lcfi1472: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1233: +; NoVLX-NEXT: .Lcfi1473: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1234: +; NoVLX-NEXT: .Lcfi1474: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -39707,64 +41194,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39773,8 +41260,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -39799,15 +41292,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1235: +; NoVLX-NEXT: .Lcfi1480: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1236: +; NoVLX-NEXT: .Lcfi1481: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1237: +; NoVLX-NEXT: .Lcfi1482: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 @@ -39820,64 +41328,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39886,8 +41394,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -39913,12 +41427,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1238: +; NoVLX-NEXT: .Lcfi1488: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1239: +; NoVLX-NEXT: .Lcfi1489: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1240: +; NoVLX-NEXT: .Lcfi1490: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -39927,15 +41441,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1241: +; NoVLX-NEXT: .Lcfi1491: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1242: +; NoVLX-NEXT: .Lcfi1492: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1243: +; NoVLX-NEXT: .Lcfi1493: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1244: +; NoVLX-NEXT: .Lcfi1494: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1245: +; NoVLX-NEXT: .Lcfi1495: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -39944,6 +41458,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -39986,11 +41504,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40002,15 +41520,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40026,6 +41540,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -40047,12 +41562,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1246: +; NoVLX-NEXT: .Lcfi1496: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1247: +; NoVLX-NEXT: .Lcfi1497: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1248: +; NoVLX-NEXT: .Lcfi1498: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40061,23 +41576,27 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1249: +; NoVLX-NEXT: .Lcfi1499: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1250: +; NoVLX-NEXT: .Lcfi1500: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1251: +; NoVLX-NEXT: .Lcfi1501: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1252: +; NoVLX-NEXT: .Lcfi1502: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1253: +; NoVLX-NEXT: .Lcfi1503: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40120,11 +41639,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40136,15 +41655,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40160,6 +41675,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -40183,12 +41699,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1254: +; NoVLX-NEXT: .Lcfi1504: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1255: +; NoVLX-NEXT: .Lcfi1505: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1256: +; NoVLX-NEXT: .Lcfi1506: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40197,15 +41713,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1257: +; NoVLX-NEXT: .Lcfi1507: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1258: +; NoVLX-NEXT: .Lcfi1508: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1259: +; NoVLX-NEXT: .Lcfi1509: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1260: +; NoVLX-NEXT: .Lcfi1510: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1261: +; NoVLX-NEXT: .Lcfi1511: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -40215,6 +41731,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40257,11 +41777,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40273,15 +41793,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40297,6 +41813,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -40321,12 +41838,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1262: +; NoVLX-NEXT: .Lcfi1512: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1263: +; NoVLX-NEXT: .Lcfi1513: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1264: +; NoVLX-NEXT: .Lcfi1514: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40335,24 +41852,28 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1265: +; NoVLX-NEXT: .Lcfi1515: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1266: +; NoVLX-NEXT: .Lcfi1516: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1267: +; NoVLX-NEXT: .Lcfi1517: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1268: +; NoVLX-NEXT: .Lcfi1518: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1269: +; NoVLX-NEXT: .Lcfi1519: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40395,11 +41916,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40411,15 +41932,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40435,6 +41952,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> @@ -40460,58 +41978,62 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1270: +; NoVLX-NEXT: .Lcfi1520: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1271: +; NoVLX-NEXT: .Lcfi1521: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1272: +; NoVLX-NEXT: .Lcfi1522: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -40519,72 +42041,79 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -40592,7 +42121,7 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx @@ -40600,122 +42129,111 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rax -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rcx +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm4, %ymm3 -; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm3 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm1, %ymm3, %ymm2 -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40795,6 +42313,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -40816,177 +42335,177 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1273: +; NoVLX-NEXT: .Lcfi1523: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1274: +; NoVLX-NEXT: .Lcfi1524: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1275: +; NoVLX-NEXT: .Lcfi1525: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rdx, %rcx -; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx -; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rdx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rdx +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm0, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor 32(%rdi), %ymm0, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm0, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41066,6 +42585,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -41089,12 +42609,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1276: +; NoVLX-NEXT: .Lcfi1526: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1277: +; NoVLX-NEXT: .Lcfi1527: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1278: +; NoVLX-NEXT: .Lcfi1528: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -41105,12 +42625,17 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -41118,10 +42643,9 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -41139,40 +42663,39 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -41180,170 +42703,171 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm2 +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm4 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm5 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm5, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm5, %ymm6, %ymm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm8, %ymm2 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -41402,24 +42926,20 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41433,6 +42953,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -41457,12 +42978,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1279: +; NoVLX-NEXT: .Lcfi1529: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1280: +; NoVLX-NEXT: .Lcfi1530: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1281: +; NoVLX-NEXT: .Lcfi1531: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -41474,6 +42995,8 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -41486,223 +43009,221 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm6 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor (%rsi), %ymm3, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm4, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm5, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpxor 32(%rsi), %ymm3, %ymm4 -; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41716,6 +43237,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> @@ -41760,8 +43282,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -41778,6 +43300,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -41799,9 +43322,9 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -41818,8 +43341,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -41836,6 +43359,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -41861,6 +43385,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -41872,14 +43397,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -41896,8 +43420,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -41914,6 +43438,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -41941,6 +43466,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -41952,14 +43478,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -41976,8 +43501,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -41994,6 +43519,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42040,8 +43566,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -42058,6 +43584,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42085,6 +43612,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42096,14 +43624,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -42120,8 +43647,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -42138,6 +43665,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42183,8 +43711,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42201,6 +43729,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42222,9 +43751,9 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -42240,8 +43769,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42258,6 +43787,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42283,6 +43813,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42294,14 +43825,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -42317,8 +43847,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42335,6 +43865,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42362,6 +43893,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42373,14 +43905,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -42396,8 +43927,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42414,6 +43945,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42459,8 +43991,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42477,6 +44009,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42504,6 +44037,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42515,14 +44049,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -42538,8 +44071,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -42556,6 +44089,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42582,12 +44116,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1282: +; NoVLX-NEXT: .Lcfi1532: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1283: +; NoVLX-NEXT: .Lcfi1533: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1284: +; NoVLX-NEXT: .Lcfi1534: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42608,6 +44142,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42628,19 +44163,19 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1285: +; NoVLX-NEXT: .Lcfi1535: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1286: +; NoVLX-NEXT: .Lcfi1536: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1287: +; NoVLX-NEXT: .Lcfi1537: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42654,6 +44189,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42676,18 +44212,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1288: +; NoVLX-NEXT: .Lcfi1538: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1289: +; NoVLX-NEXT: .Lcfi1539: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1290: +; NoVLX-NEXT: .Lcfi1540: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42699,14 +44236,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42720,6 +44256,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42744,18 +44281,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1291: +; NoVLX-NEXT: .Lcfi1541: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1292: +; NoVLX-NEXT: .Lcfi1542: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1293: +; NoVLX-NEXT: .Lcfi1543: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42767,14 +44305,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42788,6 +44325,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42813,12 +44351,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1294: +; NoVLX-NEXT: .Lcfi1544: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1295: +; NoVLX-NEXT: .Lcfi1545: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1296: +; NoVLX-NEXT: .Lcfi1546: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42840,6 +44378,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42863,12 +44402,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1297: +; NoVLX-NEXT: .Lcfi1547: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1298: +; NoVLX-NEXT: .Lcfi1548: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1299: +; NoVLX-NEXT: .Lcfi1549: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42876,6 +44415,7 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -42887,14 +44427,13 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42908,6 +44447,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42934,12 +44474,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1300: +; NoVLX-NEXT: .Lcfi1550: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1301: +; NoVLX-NEXT: .Lcfi1551: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1302: +; NoVLX-NEXT: .Lcfi1552: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -42949,8 +44489,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -42966,6 +44506,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -42986,12 +44527,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1303: +; NoVLX-NEXT: .Lcfi1553: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1304: +; NoVLX-NEXT: .Lcfi1554: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1305: +; NoVLX-NEXT: .Lcfi1555: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43001,8 +44542,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43018,6 +44559,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -43040,12 +44582,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1306: +; NoVLX-NEXT: .Lcfi1556: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1307: +; NoVLX-NEXT: .Lcfi1557: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1308: +; NoVLX-NEXT: .Lcfi1558: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43055,6 +44597,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43067,14 +44610,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43090,6 +44632,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -43114,12 +44657,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1309: +; NoVLX-NEXT: .Lcfi1559: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1310: +; NoVLX-NEXT: .Lcfi1560: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1311: +; NoVLX-NEXT: .Lcfi1561: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43129,6 +44672,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43141,14 +44685,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43164,6 +44707,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -43189,12 +44733,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1312: +; NoVLX-NEXT: .Lcfi1562: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1313: +; NoVLX-NEXT: .Lcfi1563: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1314: +; NoVLX-NEXT: .Lcfi1564: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43205,8 +44749,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43222,6 +44766,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -43245,12 +44790,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1315: +; NoVLX-NEXT: .Lcfi1565: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1316: +; NoVLX-NEXT: .Lcfi1566: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1317: +; NoVLX-NEXT: .Lcfi1567: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43261,6 +44806,7 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43273,14 +44819,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43296,6 +44841,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> @@ -43330,6 +44876,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43358,6 +44905,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43389,6 +44937,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43421,6 +44970,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43453,6 +45003,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43485,6 +45036,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43511,12 +45063,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1318: +; NoVLX-NEXT: .Lcfi1568: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1319: +; NoVLX-NEXT: .Lcfi1569: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1320: +; NoVLX-NEXT: .Lcfi1570: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43565,6 +45117,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43586,12 +45139,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1321: +; NoVLX-NEXT: .Lcfi1571: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1322: +; NoVLX-NEXT: .Lcfi1572: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1323: +; NoVLX-NEXT: .Lcfi1573: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43640,6 +45193,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43663,12 +45217,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1324: +; NoVLX-NEXT: .Lcfi1574: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1325: +; NoVLX-NEXT: .Lcfi1575: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1326: +; NoVLX-NEXT: .Lcfi1576: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43719,6 +45273,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43743,12 +45298,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1327: +; NoVLX-NEXT: .Lcfi1577: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1328: +; NoVLX-NEXT: .Lcfi1578: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1329: +; NoVLX-NEXT: .Lcfi1579: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43799,6 +45354,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43824,12 +45380,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1330: +; NoVLX-NEXT: .Lcfi1580: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1331: +; NoVLX-NEXT: .Lcfi1581: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1332: +; NoVLX-NEXT: .Lcfi1582: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43878,6 +45434,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43902,12 +45459,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1333: +; NoVLX-NEXT: .Lcfi1583: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1334: +; NoVLX-NEXT: .Lcfi1584: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1335: +; NoVLX-NEXT: .Lcfi1585: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43958,6 +45515,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -43984,55 +45542,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1336: +; NoVLX-NEXT: .Lcfi1586: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1337: +; NoVLX-NEXT: .Lcfi1587: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1338: +; NoVLX-NEXT: .Lcfi1588: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44043,6 +45601,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44064,55 +45623,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1339: +; NoVLX-NEXT: .Lcfi1589: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1340: +; NoVLX-NEXT: .Lcfi1590: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1341: +; NoVLX-NEXT: .Lcfi1591: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44123,6 +45682,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44146,12 +45706,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1342: +; NoVLX-NEXT: .Lcfi1592: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1343: +; NoVLX-NEXT: .Lcfi1593: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1344: +; NoVLX-NEXT: .Lcfi1594: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44160,43 +45720,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44207,6 +45767,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44231,12 +45792,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1345: +; NoVLX-NEXT: .Lcfi1595: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1346: +; NoVLX-NEXT: .Lcfi1596: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1347: +; NoVLX-NEXT: .Lcfi1597: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44245,43 +45806,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44292,6 +45853,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44317,55 +45879,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1348: +; NoVLX-NEXT: .Lcfi1598: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1349: +; NoVLX-NEXT: .Lcfi1599: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1350: +; NoVLX-NEXT: .Lcfi1600: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44376,6 +45938,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44400,12 +45963,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1351: +; NoVLX-NEXT: .Lcfi1601: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1352: +; NoVLX-NEXT: .Lcfi1602: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1353: +; NoVLX-NEXT: .Lcfi1603: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44414,43 +45977,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44461,6 +46024,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -44487,78 +46051,93 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1354: +; NoVLX-NEXT: .Lcfi1604: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1355: +; NoVLX-NEXT: .Lcfi1605: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1356: +; NoVLX-NEXT: .Lcfi1606: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -44567,8 +46146,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -44590,78 +46175,93 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1357: +; NoVLX-NEXT: .Lcfi1612: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1358: +; NoVLX-NEXT: .Lcfi1613: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .Lcfi1614: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -44670,8 +46270,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -44695,79 +46301,94 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .Lcfi1620: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .Lcfi1621: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .Lcfi1622: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -44776,8 +46397,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -44802,79 +46429,94 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .Lcfi1628: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1364: +; NoVLX-NEXT: .Lcfi1629: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1365: +; NoVLX-NEXT: .Lcfi1630: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -44883,8 +46525,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -44910,78 +46558,93 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1366: +; NoVLX-NEXT: .Lcfi1636: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .Lcfi1637: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .Lcfi1638: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -44990,8 +46653,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45016,79 +46685,94 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .Lcfi1644: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .Lcfi1645: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .Lcfi1646: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -45097,8 +46781,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45125,12 +46815,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1372: +; NoVLX-NEXT: .Lcfi1652: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1373: +; NoVLX-NEXT: .Lcfi1653: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1374: +; NoVLX-NEXT: .Lcfi1654: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45139,17 +46829,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .Lcfi1655: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .Lcfi1656: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .Lcfi1657: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .Lcfi1658: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .Lcfi1659: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45192,11 +46886,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45208,15 +46902,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45232,6 +46922,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45253,12 +46944,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1380: +; NoVLX-NEXT: .Lcfi1660: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1381: +; NoVLX-NEXT: .Lcfi1661: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1382: +; NoVLX-NEXT: .Lcfi1662: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45267,17 +46958,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .Lcfi1663: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .Lcfi1664: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .Lcfi1665: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .Lcfi1666: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .Lcfi1667: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45320,11 +47015,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45336,15 +47031,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45360,6 +47051,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45383,12 +47075,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .Lcfi1668: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .Lcfi1669: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .Lcfi1670: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45397,18 +47089,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1391: +; NoVLX-NEXT: .Lcfi1671: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1392: +; NoVLX-NEXT: .Lcfi1672: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1393: +; NoVLX-NEXT: .Lcfi1673: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1394: +; NoVLX-NEXT: .Lcfi1674: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1395: +; NoVLX-NEXT: .Lcfi1675: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45451,11 +47147,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45467,15 +47163,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45491,6 +47183,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45515,12 +47208,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1396: +; NoVLX-NEXT: .Lcfi1676: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1397: +; NoVLX-NEXT: .Lcfi1677: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1398: +; NoVLX-NEXT: .Lcfi1678: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45529,18 +47222,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1399: +; NoVLX-NEXT: .Lcfi1679: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1400: +; NoVLX-NEXT: .Lcfi1680: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1401: +; NoVLX-NEXT: .Lcfi1681: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1402: +; NoVLX-NEXT: .Lcfi1682: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1403: +; NoVLX-NEXT: .Lcfi1683: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45583,11 +47280,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45599,15 +47296,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45623,6 +47316,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45648,12 +47342,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1404: +; NoVLX-NEXT: .Lcfi1684: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1405: +; NoVLX-NEXT: .Lcfi1685: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1406: +; NoVLX-NEXT: .Lcfi1686: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45662,17 +47356,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1407: +; NoVLX-NEXT: .Lcfi1687: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1408: +; NoVLX-NEXT: .Lcfi1688: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1409: +; NoVLX-NEXT: .Lcfi1689: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1410: +; NoVLX-NEXT: .Lcfi1690: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1411: +; NoVLX-NEXT: .Lcfi1691: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45715,11 +47413,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45731,15 +47429,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45755,6 +47449,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45779,12 +47474,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1412: +; NoVLX-NEXT: .Lcfi1692: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1413: +; NoVLX-NEXT: .Lcfi1693: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1414: +; NoVLX-NEXT: .Lcfi1694: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45793,18 +47488,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1415: +; NoVLX-NEXT: .Lcfi1695: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1416: +; NoVLX-NEXT: .Lcfi1696: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1417: +; NoVLX-NEXT: .Lcfi1697: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1418: +; NoVLX-NEXT: .Lcfi1698: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1419: +; NoVLX-NEXT: .Lcfi1699: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45847,11 +47546,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45863,15 +47562,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45887,6 +47582,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> @@ -45922,6 +47618,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -45943,15 +47640,16 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -45977,6 +47675,7 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -45984,16 +47683,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46021,6 +47720,7 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46028,16 +47728,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46074,6 +47774,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46101,6 +47802,7 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46108,16 +47810,16 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46166,6 +47868,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46208,6 +47911,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46262,6 +47966,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46287,9 +47992,9 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46318,6 +48023,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46366,6 +48072,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46422,6 +48129,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46469,6 +48177,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46510,6 +48219,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46563,6 +48273,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46588,9 +48299,9 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46618,6 +48329,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46665,6 +48377,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46720,6 +48433,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46746,12 +48460,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1420: +; NoVLX-NEXT: .Lcfi1700: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1421: +; NoVLX-NEXT: .Lcfi1701: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1422: +; NoVLX-NEXT: .Lcfi1702: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -46772,6 +48486,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46792,19 +48507,19 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1423: +; NoVLX-NEXT: .Lcfi1703: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1424: +; NoVLX-NEXT: .Lcfi1704: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1425: +; NoVLX-NEXT: .Lcfi1705: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46818,6 +48533,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46840,18 +48556,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1426: +; NoVLX-NEXT: .Lcfi1706: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1427: +; NoVLX-NEXT: .Lcfi1707: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1428: +; NoVLX-NEXT: .Lcfi1708: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46859,10 +48576,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46876,6 +48592,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46900,18 +48617,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1429: +; NoVLX-NEXT: .Lcfi1709: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1430: +; NoVLX-NEXT: .Lcfi1710: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1431: +; NoVLX-NEXT: .Lcfi1711: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46919,10 +48637,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46936,6 +48653,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -46961,12 +48679,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1432: +; NoVLX-NEXT: .Lcfi1712: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1433: +; NoVLX-NEXT: .Lcfi1713: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1434: +; NoVLX-NEXT: .Lcfi1714: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -46988,6 +48706,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47011,12 +48730,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1435: +; NoVLX-NEXT: .Lcfi1715: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1436: +; NoVLX-NEXT: .Lcfi1716: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1437: +; NoVLX-NEXT: .Lcfi1717: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -47024,6 +48743,7 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47031,10 +48751,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -47048,6 +48767,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47074,12 +48794,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1438: +; NoVLX-NEXT: .Lcfi1718: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1439: +; NoVLX-NEXT: .Lcfi1719: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1440: +; NoVLX-NEXT: .Lcfi1720: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47089,8 +48809,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47106,6 +48826,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47126,12 +48847,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1441: +; NoVLX-NEXT: .Lcfi1721: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1442: +; NoVLX-NEXT: .Lcfi1722: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1443: +; NoVLX-NEXT: .Lcfi1723: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47141,8 +48862,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47158,6 +48879,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47180,12 +48902,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1444: +; NoVLX-NEXT: .Lcfi1724: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1445: +; NoVLX-NEXT: .Lcfi1725: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1446: +; NoVLX-NEXT: .Lcfi1726: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47205,8 +48927,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47222,6 +48944,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47246,12 +48969,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1447: +; NoVLX-NEXT: .Lcfi1727: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1448: +; NoVLX-NEXT: .Lcfi1728: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1449: +; NoVLX-NEXT: .Lcfi1729: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47271,8 +48994,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47288,6 +49011,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47313,12 +49037,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1450: +; NoVLX-NEXT: .Lcfi1730: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1451: +; NoVLX-NEXT: .Lcfi1731: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1452: +; NoVLX-NEXT: .Lcfi1732: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47329,8 +49053,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47346,6 +49070,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47369,12 +49094,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1453: +; NoVLX-NEXT: .Lcfi1733: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1454: +; NoVLX-NEXT: .Lcfi1734: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1455: +; NoVLX-NEXT: .Lcfi1735: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47395,8 +49120,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47412,6 +49137,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> @@ -47460,8 +49186,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47478,6 +49204,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47520,8 +49247,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47538,6 +49265,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47565,6 +49293,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47582,7 +49311,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -47600,8 +49328,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47618,6 +49346,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47644,9 +49373,10 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47664,7 +49394,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -47682,8 +49411,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47700,6 +49429,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47748,8 +49478,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47766,6 +49496,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47795,6 +49526,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47812,7 +49544,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -47830,8 +49561,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -47848,6 +49579,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47895,8 +49627,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -47913,6 +49645,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47954,8 +49687,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -47972,6 +49705,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -47999,6 +49733,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48016,7 +49751,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -48033,8 +49767,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -48051,6 +49785,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48077,9 +49812,10 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48097,7 +49833,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -48114,8 +49849,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -48132,6 +49867,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48179,8 +49915,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -48197,6 +49933,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48226,6 +49963,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48243,7 +49981,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -48260,8 +49997,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -48278,6 +50015,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48305,12 +50043,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1456: +; NoVLX-NEXT: .Lcfi1736: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1457: +; NoVLX-NEXT: .Lcfi1737: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1458: +; NoVLX-NEXT: .Lcfi1738: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48332,6 +50070,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48353,12 +50092,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .Lcfi1739: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .Lcfi1740: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .Lcfi1741: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48380,6 +50119,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48403,12 +50143,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .Lcfi1742: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .Lcfi1743: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1464: +; NoVLX-NEXT: .Lcfi1744: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48416,6 +50156,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48433,7 +50174,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48448,6 +50188,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48473,19 +50214,20 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1465: +; NoVLX-NEXT: .Lcfi1745: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1466: +; NoVLX-NEXT: .Lcfi1746: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .Lcfi1747: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48503,7 +50245,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48518,6 +50259,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48544,12 +50286,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .Lcfi1748: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .Lcfi1749: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .Lcfi1750: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48572,6 +50314,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48596,12 +50339,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .Lcfi1751: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1472: +; NoVLX-NEXT: .Lcfi1752: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1473: +; NoVLX-NEXT: .Lcfi1753: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48610,6 +50353,7 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48627,7 +50371,6 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48642,6 +50385,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48669,12 +50413,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1474: +; NoVLX-NEXT: .Lcfi1754: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .Lcfi1755: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .Lcfi1756: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48685,8 +50429,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48702,6 +50446,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48723,12 +50468,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .Lcfi1757: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .Lcfi1758: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .Lcfi1759: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48739,8 +50484,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48756,6 +50501,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48779,12 +50525,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1480: +; NoVLX-NEXT: .Lcfi1760: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1481: +; NoVLX-NEXT: .Lcfi1761: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1482: +; NoVLX-NEXT: .Lcfi1762: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48795,6 +50541,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -48807,14 +50554,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48830,6 +50576,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48855,12 +50602,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .Lcfi1763: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .Lcfi1764: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .Lcfi1765: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48871,6 +50618,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -48883,14 +50631,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48906,6 +50653,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48932,12 +50680,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .Lcfi1766: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .Lcfi1767: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1488: +; NoVLX-NEXT: .Lcfi1768: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48949,8 +50697,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48966,6 +50714,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -48990,12 +50739,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1489: +; NoVLX-NEXT: .Lcfi1769: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1490: +; NoVLX-NEXT: .Lcfi1770: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1491: +; NoVLX-NEXT: .Lcfi1771: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -49007,6 +50756,7 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49019,14 +50769,13 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -49042,6 +50791,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> @@ -49072,6 +50822,7 @@ ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49096,6 +50847,7 @@ ; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49123,6 +50875,7 @@ ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49151,6 +50904,7 @@ ; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49179,6 +50933,7 @@ ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49207,6 +50962,7 @@ ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49233,12 +50989,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1492: +; NoVLX-NEXT: .Lcfi1772: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1493: +; NoVLX-NEXT: .Lcfi1773: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1494: +; NoVLX-NEXT: .Lcfi1774: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49285,6 +51041,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49306,12 +51063,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1495: +; NoVLX-NEXT: .Lcfi1775: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1496: +; NoVLX-NEXT: .Lcfi1776: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1497: +; NoVLX-NEXT: .Lcfi1777: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49358,6 +51115,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49381,12 +51139,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1498: +; NoVLX-NEXT: .Lcfi1778: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1499: +; NoVLX-NEXT: .Lcfi1779: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1500: +; NoVLX-NEXT: .Lcfi1780: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49434,6 +51192,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49458,12 +51217,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1501: +; NoVLX-NEXT: .Lcfi1781: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1502: +; NoVLX-NEXT: .Lcfi1782: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1503: +; NoVLX-NEXT: .Lcfi1783: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49511,6 +51270,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49536,12 +51296,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1504: +; NoVLX-NEXT: .Lcfi1784: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1505: +; NoVLX-NEXT: .Lcfi1785: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1506: +; NoVLX-NEXT: .Lcfi1786: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49588,6 +51348,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49612,12 +51373,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1507: +; NoVLX-NEXT: .Lcfi1787: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1508: +; NoVLX-NEXT: .Lcfi1788: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1509: +; NoVLX-NEXT: .Lcfi1789: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49665,6 +51426,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49691,53 +51453,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1510: +; NoVLX-NEXT: .Lcfi1790: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1511: +; NoVLX-NEXT: .Lcfi1791: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1512: +; NoVLX-NEXT: .Lcfi1792: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49748,6 +51510,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49769,53 +51532,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1513: +; NoVLX-NEXT: .Lcfi1793: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1514: +; NoVLX-NEXT: .Lcfi1794: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1515: +; NoVLX-NEXT: .Lcfi1795: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49826,6 +51589,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49849,54 +51613,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1516: +; NoVLX-NEXT: .Lcfi1796: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1517: +; NoVLX-NEXT: .Lcfi1797: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1518: +; NoVLX-NEXT: .Lcfi1798: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49907,6 +51671,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -49931,54 +51696,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1519: +; NoVLX-NEXT: .Lcfi1799: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1520: +; NoVLX-NEXT: .Lcfi1800: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1521: +; NoVLX-NEXT: .Lcfi1801: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49989,6 +51754,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -50014,53 +51780,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1522: +; NoVLX-NEXT: .Lcfi1802: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1523: +; NoVLX-NEXT: .Lcfi1803: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1524: +; NoVLX-NEXT: .Lcfi1804: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -50071,6 +51837,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -50095,54 +51862,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1525: +; NoVLX-NEXT: .Lcfi1805: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1526: +; NoVLX-NEXT: .Lcfi1806: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1527: +; NoVLX-NEXT: .Lcfi1807: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -50153,6 +51920,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> @@ -50196,8 +51964,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50214,6 +51982,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50251,8 +52020,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50269,6 +52038,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50308,8 +52078,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50326,6 +52096,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50373,8 +52144,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50391,6 +52162,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50419,8 +52191,8 @@ ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -50438,8 +52210,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50456,6 +52228,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50505,8 +52278,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -50523,6 +52296,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50565,8 +52339,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50583,6 +52357,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50619,8 +52394,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50637,6 +52412,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50675,8 +52451,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50693,6 +52469,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50739,8 +52516,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50757,6 +52534,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50785,8 +52563,8 @@ ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -50803,8 +52581,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50821,6 +52599,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50869,8 +52648,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -50887,6 +52666,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50913,12 +52693,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1528: +; NoVLX-NEXT: .Lcfi1808: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1529: +; NoVLX-NEXT: .Lcfi1809: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1530: +; NoVLX-NEXT: .Lcfi1810: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50936,6 +52716,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -50956,12 +52737,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1531: +; NoVLX-NEXT: .Lcfi1811: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1532: +; NoVLX-NEXT: .Lcfi1812: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1533: +; NoVLX-NEXT: .Lcfi1813: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50979,6 +52760,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51000,12 +52782,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1534: +; NoVLX-NEXT: .Lcfi1814: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1535: +; NoVLX-NEXT: .Lcfi1815: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1536: +; NoVLX-NEXT: .Lcfi1816: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51024,6 +52806,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51048,12 +52831,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1537: +; NoVLX-NEXT: .Lcfi1817: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1538: +; NoVLX-NEXT: .Lcfi1818: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1539: +; NoVLX-NEXT: .Lcfi1819: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51077,6 +52860,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51101,12 +52885,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1540: +; NoVLX-NEXT: .Lcfi1820: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1541: +; NoVLX-NEXT: .Lcfi1821: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1542: +; NoVLX-NEXT: .Lcfi1822: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51114,8 +52898,8 @@ ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -51130,6 +52914,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51155,12 +52940,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1543: +; NoVLX-NEXT: .Lcfi1823: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1544: +; NoVLX-NEXT: .Lcfi1824: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1545: +; NoVLX-NEXT: .Lcfi1825: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51185,6 +52970,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51211,20 +52997,20 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1546: +; NoVLX-NEXT: .Lcfi1826: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1547: +; NoVLX-NEXT: .Lcfi1827: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1548: +; NoVLX-NEXT: .Lcfi1828: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51240,6 +53026,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51260,20 +53047,20 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1549: +; NoVLX-NEXT: .Lcfi1829: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1550: +; NoVLX-NEXT: .Lcfi1830: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1551: +; NoVLX-NEXT: .Lcfi1831: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51289,6 +53076,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51310,12 +53098,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1552: +; NoVLX-NEXT: .Lcfi1832: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1553: +; NoVLX-NEXT: .Lcfi1833: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1554: +; NoVLX-NEXT: .Lcfi1834: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51323,8 +53111,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51340,6 +53128,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51364,12 +53153,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1555: +; NoVLX-NEXT: .Lcfi1835: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1556: +; NoVLX-NEXT: .Lcfi1836: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1557: +; NoVLX-NEXT: .Lcfi1837: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51382,8 +53171,8 @@ ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51399,6 +53188,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51423,12 +53213,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1558: +; NoVLX-NEXT: .Lcfi1838: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1559: +; NoVLX-NEXT: .Lcfi1839: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1560: +; NoVLX-NEXT: .Lcfi1840: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51441,8 +53231,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51458,6 +53248,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51483,12 +53274,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1561: +; NoVLX-NEXT: .Lcfi1841: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1562: +; NoVLX-NEXT: .Lcfi1842: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1563: +; NoVLX-NEXT: .Lcfi1843: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51502,8 +53293,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51519,6 +53310,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> @@ -51553,6 +53345,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51581,6 +53374,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51610,6 +53404,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51642,6 +53437,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51674,6 +53470,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51707,6 +53504,7 @@ ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51734,12 +53532,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1564: +; NoVLX-NEXT: .Lcfi1844: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1565: +; NoVLX-NEXT: .Lcfi1845: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1566: +; NoVLX-NEXT: .Lcfi1846: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51788,6 +53586,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51809,12 +53608,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1567: +; NoVLX-NEXT: .Lcfi1847: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1568: +; NoVLX-NEXT: .Lcfi1848: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1569: +; NoVLX-NEXT: .Lcfi1849: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51863,6 +53662,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51885,12 +53685,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1570: +; NoVLX-NEXT: .Lcfi1850: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1571: +; NoVLX-NEXT: .Lcfi1851: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1572: +; NoVLX-NEXT: .Lcfi1852: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51939,6 +53739,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -51963,12 +53764,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1573: +; NoVLX-NEXT: .Lcfi1853: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1574: +; NoVLX-NEXT: .Lcfi1854: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1575: +; NoVLX-NEXT: .Lcfi1855: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52019,6 +53820,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52043,12 +53845,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1576: +; NoVLX-NEXT: .Lcfi1856: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1577: +; NoVLX-NEXT: .Lcfi1857: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1578: +; NoVLX-NEXT: .Lcfi1858: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52099,6 +53901,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52124,12 +53927,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1579: +; NoVLX-NEXT: .Lcfi1859: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1580: +; NoVLX-NEXT: .Lcfi1860: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1581: +; NoVLX-NEXT: .Lcfi1861: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52180,6 +53983,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52207,55 +54011,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1582: +; NoVLX-NEXT: .Lcfi1862: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1583: +; NoVLX-NEXT: .Lcfi1863: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1584: +; NoVLX-NEXT: .Lcfi1864: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52266,6 +54070,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52287,55 +54092,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1585: +; NoVLX-NEXT: .Lcfi1865: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1586: +; NoVLX-NEXT: .Lcfi1866: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1587: +; NoVLX-NEXT: .Lcfi1867: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovaps (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52346,6 +54151,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52368,55 +54174,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1588: +; NoVLX-NEXT: .Lcfi1868: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1589: +; NoVLX-NEXT: .Lcfi1869: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1590: +; NoVLX-NEXT: .Lcfi1870: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52427,6 +54233,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52451,12 +54258,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1591: +; NoVLX-NEXT: .Lcfi1871: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1592: +; NoVLX-NEXT: .Lcfi1872: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1593: +; NoVLX-NEXT: .Lcfi1873: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52465,43 +54272,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52512,6 +54319,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52536,12 +54344,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1594: +; NoVLX-NEXT: .Lcfi1874: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1595: +; NoVLX-NEXT: .Lcfi1875: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1596: +; NoVLX-NEXT: .Lcfi1876: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52550,43 +54358,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52597,6 +54405,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52622,12 +54431,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1597: +; NoVLX-NEXT: .Lcfi1877: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1598: +; NoVLX-NEXT: .Lcfi1878: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1599: +; NoVLX-NEXT: .Lcfi1879: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52636,43 +54445,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52683,6 +54492,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -52710,78 +54520,93 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1600: +; NoVLX-NEXT: .Lcfi1880: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1601: +; NoVLX-NEXT: .Lcfi1881: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1602: +; NoVLX-NEXT: .Lcfi1882: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1883: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1884: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1885: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1886: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1887: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -52790,8 +54615,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -52813,78 +54644,93 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1603: +; NoVLX-NEXT: .Lcfi1888: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1604: +; NoVLX-NEXT: .Lcfi1889: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1605: +; NoVLX-NEXT: .Lcfi1890: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1891: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1892: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1893: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1894: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1895: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -52893,8 +54739,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -52917,78 +54769,93 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1606: +; NoVLX-NEXT: .Lcfi1896: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .Lcfi1897: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .Lcfi1898: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1899: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1900: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1901: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1902: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1903: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -52997,8 +54864,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53023,79 +54896,94 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .Lcfi1904: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .Lcfi1905: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .Lcfi1906: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1911: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -53104,8 +54992,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53130,79 +55024,94 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1612: +; NoVLX-NEXT: .Lcfi1912: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1613: +; NoVLX-NEXT: .Lcfi1913: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1614: +; NoVLX-NEXT: .Lcfi1914: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1919: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -53211,8 +55120,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53238,79 +55153,94 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .Lcfi1920: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .Lcfi1921: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .Lcfi1922: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1927: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -53319,8 +55249,14 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53338,18 +55274,12 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; VLX-NEXT: kmovw %k0, %eax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: retq +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -53372,6 +55302,7 @@ ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53394,12 +55325,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .Lcfi1928: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .Lcfi1929: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1620: +; NoVLX-NEXT: .Lcfi1930: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53408,17 +55339,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1621: +; NoVLX-NEXT: .Lcfi1931: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1622: +; NoVLX-NEXT: .Lcfi1932: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .Lcfi1933: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .Lcfi1934: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .Lcfi1935: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53461,11 +55396,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53477,15 +55412,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53501,6 +55432,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53522,12 +55454,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .Lcfi1936: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .Lcfi1937: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1628: +; NoVLX-NEXT: .Lcfi1938: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53536,17 +55468,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1629: +; NoVLX-NEXT: .Lcfi1939: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1630: +; NoVLX-NEXT: .Lcfi1940: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .Lcfi1941: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .Lcfi1942: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .Lcfi1943: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53589,11 +55525,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53605,15 +55541,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53629,6 +55561,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53651,12 +55584,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .Lcfi1944: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .Lcfi1945: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1636: +; NoVLX-NEXT: .Lcfi1946: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53665,17 +55598,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1637: +; NoVLX-NEXT: .Lcfi1947: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1638: +; NoVLX-NEXT: .Lcfi1948: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .Lcfi1949: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .Lcfi1950: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .Lcfi1951: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53718,11 +55655,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53734,15 +55671,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53758,6 +55691,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53782,12 +55716,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .Lcfi1952: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .Lcfi1953: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1644: +; NoVLX-NEXT: .Lcfi1954: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53796,18 +55730,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1645: +; NoVLX-NEXT: .Lcfi1955: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1646: +; NoVLX-NEXT: .Lcfi1956: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .Lcfi1957: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .Lcfi1958: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .Lcfi1959: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53850,11 +55788,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53866,15 +55804,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53890,6 +55824,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -53914,12 +55849,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .Lcfi1960: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .Lcfi1961: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1652: +; NoVLX-NEXT: .Lcfi1962: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53928,18 +55863,156 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1653: +; NoVLX-NEXT: .Lcfi1963: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1654: +; NoVLX-NEXT: .Lcfi1964: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1655: +; NoVLX-NEXT: .Lcfi1965: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1656: +; NoVLX-NEXT: .Lcfi1966: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1657: +; NoVLX-NEXT: .Lcfi1967: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1968: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1969: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1970: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1971: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1972: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1973: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1974: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1975: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53982,144 +56055,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__a to <16 x float> - %load = load <8 x i64>, <8 x i64>* %__b - %1 = bitcast <8 x i64> %load to <16 x float> - %2 = fcmp oeq <16 x float> %0, %1 - %3 = bitcast i16 %__u to <16 x i1> - %4 = and <16 x i1> %2, %3 - %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> - %6 = bitcast <64 x i1> %5 to i64 - ret i64 %6 -} - -define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: -; VLX: # BB#0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1658: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1659: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1660: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1661: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1662: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1663: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1664: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1665: -; NoVLX-NEXT: .cfi_offset %r15, -24 -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r10d -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d -; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi -; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -54131,15 +56071,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -54155,6 +56091,7 @@ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -54185,6 +56122,7 @@ ; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -54210,6 +56148,7 @@ ; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> @@ -54239,6 +56178,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54266,6 +56206,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54295,6 +56236,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54331,6 +56273,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54358,8 +56301,8 @@ ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 @@ -54367,6 +56310,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54405,6 +56349,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54450,6 +56395,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54489,6 +56435,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54530,6 +56477,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54578,6 +56526,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54626,6 +56575,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54676,6 +56626,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54720,6 +56671,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54758,6 +56710,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54798,6 +56751,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54845,6 +56799,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54892,6 +56847,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54941,6 +56897,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -54967,12 +56924,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1666: +; NoVLX-NEXT: .Lcfi1976: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1667: +; NoVLX-NEXT: .Lcfi1977: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1668: +; NoVLX-NEXT: .Lcfi1978: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -54990,6 +56947,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55010,12 +56968,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1669: +; NoVLX-NEXT: .Lcfi1979: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1670: +; NoVLX-NEXT: .Lcfi1980: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1671: +; NoVLX-NEXT: .Lcfi1981: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -55033,6 +56991,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55054,12 +57013,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1672: +; NoVLX-NEXT: .Lcfi1982: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1673: +; NoVLX-NEXT: .Lcfi1983: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1674: +; NoVLX-NEXT: .Lcfi1984: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -55078,6 +57037,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55102,12 +57062,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1675: +; NoVLX-NEXT: .Lcfi1985: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1676: +; NoVLX-NEXT: .Lcfi1986: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1677: +; NoVLX-NEXT: .Lcfi1987: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -55130,6 +57090,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55154,20 +57115,20 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1678: +; NoVLX-NEXT: .Lcfi1988: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1679: +; NoVLX-NEXT: .Lcfi1989: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1680: +; NoVLX-NEXT: .Lcfi1990: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -55182,6 +57143,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55207,12 +57169,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1681: +; NoVLX-NEXT: .Lcfi1991: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1682: +; NoVLX-NEXT: .Lcfi1992: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1683: +; NoVLX-NEXT: .Lcfi1993: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -55236,6 +57198,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55262,20 +57225,20 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1684: +; NoVLX-NEXT: .Lcfi1994: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1685: +; NoVLX-NEXT: .Lcfi1995: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1686: +; NoVLX-NEXT: .Lcfi1996: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55291,6 +57254,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55311,20 +57275,20 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1687: +; NoVLX-NEXT: .Lcfi1997: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1688: +; NoVLX-NEXT: .Lcfi1998: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1689: +; NoVLX-NEXT: .Lcfi1999: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55340,6 +57304,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55361,12 +57326,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1690: +; NoVLX-NEXT: .Lcfi2000: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1691: +; NoVLX-NEXT: .Lcfi2001: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1692: +; NoVLX-NEXT: .Lcfi2002: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -55374,8 +57339,8 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55391,6 +57356,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55415,12 +57381,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1693: +; NoVLX-NEXT: .Lcfi2003: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1694: +; NoVLX-NEXT: .Lcfi2004: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1695: +; NoVLX-NEXT: .Lcfi2005: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55432,8 +57398,8 @@ ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55449,6 +57415,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55473,12 +57440,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1696: +; NoVLX-NEXT: .Lcfi2006: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1697: +; NoVLX-NEXT: .Lcfi2007: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1698: +; NoVLX-NEXT: .Lcfi2008: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55490,8 +57457,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55507,6 +57474,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55532,12 +57500,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1699: +; NoVLX-NEXT: .Lcfi2009: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1700: +; NoVLX-NEXT: .Lcfi2010: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1701: +; NoVLX-NEXT: .Lcfi2011: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55550,8 +57518,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55567,6 +57535,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> @@ -55612,8 +57581,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55630,6 +57599,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55669,8 +57639,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55687,6 +57657,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55728,8 +57699,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55746,6 +57717,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55795,8 +57767,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55813,6 +57785,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55862,8 +57835,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55880,6 +57853,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55931,8 +57905,8 @@ ; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] @@ -55949,6 +57923,7 @@ ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -55993,8 +57968,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56011,6 +57986,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56049,8 +58025,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56067,6 +58043,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56107,8 +58084,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56125,6 +58102,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56173,8 +58151,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56191,6 +58169,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56239,8 +58218,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56257,6 +58236,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56307,8 +58287,8 @@ ; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] @@ -56325,6 +58305,7 @@ ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56352,12 +58333,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1702: +; NoVLX-NEXT: .Lcfi2012: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1703: +; NoVLX-NEXT: .Lcfi2013: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1704: +; NoVLX-NEXT: .Lcfi2014: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56376,6 +58357,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56397,12 +58379,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1705: +; NoVLX-NEXT: .Lcfi2015: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1706: +; NoVLX-NEXT: .Lcfi2016: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1707: +; NoVLX-NEXT: .Lcfi2017: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56421,6 +58403,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56443,12 +58426,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1708: +; NoVLX-NEXT: .Lcfi2018: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1709: +; NoVLX-NEXT: .Lcfi2019: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1710: +; NoVLX-NEXT: .Lcfi2020: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56468,6 +58451,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56493,12 +58477,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1711: +; NoVLX-NEXT: .Lcfi2021: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1712: +; NoVLX-NEXT: .Lcfi2022: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1713: +; NoVLX-NEXT: .Lcfi2023: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56523,6 +58507,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56548,12 +58533,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1714: +; NoVLX-NEXT: .Lcfi2024: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1715: +; NoVLX-NEXT: .Lcfi2025: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1716: +; NoVLX-NEXT: .Lcfi2026: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56578,6 +58563,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56604,12 +58590,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1717: +; NoVLX-NEXT: .Lcfi2027: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1718: +; NoVLX-NEXT: .Lcfi2028: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1719: +; NoVLX-NEXT: .Lcfi2029: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56635,6 +58621,7 @@ ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56662,12 +58649,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1720: +; NoVLX-NEXT: .Lcfi2030: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1721: +; NoVLX-NEXT: .Lcfi2031: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1722: +; NoVLX-NEXT: .Lcfi2032: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56675,8 +58662,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56692,6 +58679,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56713,12 +58701,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1723: +; NoVLX-NEXT: .Lcfi2033: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1724: +; NoVLX-NEXT: .Lcfi2034: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1725: +; NoVLX-NEXT: .Lcfi2035: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56726,8 +58714,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56743,6 +58731,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56765,12 +58754,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1726: +; NoVLX-NEXT: .Lcfi2036: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1727: +; NoVLX-NEXT: .Lcfi2037: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1728: +; NoVLX-NEXT: .Lcfi2038: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56779,8 +58768,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56796,6 +58785,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56821,12 +58811,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1729: +; NoVLX-NEXT: .Lcfi2039: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1730: +; NoVLX-NEXT: .Lcfi2040: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1731: +; NoVLX-NEXT: .Lcfi2041: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56840,8 +58830,8 @@ ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56857,6 +58847,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56882,12 +58873,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1732: +; NoVLX-NEXT: .Lcfi2042: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1733: +; NoVLX-NEXT: .Lcfi2043: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1734: +; NoVLX-NEXT: .Lcfi2044: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56901,8 +58892,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56918,6 +58909,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -56944,12 +58936,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1735: +; NoVLX-NEXT: .Lcfi2045: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1736: +; NoVLX-NEXT: .Lcfi2046: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1737: +; NoVLX-NEXT: .Lcfi2047: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56964,8 +58956,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56981,6 +58973,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> @@ -57011,6 +59004,7 @@ ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57035,6 +59029,7 @@ ; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57060,6 +59055,7 @@ ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57088,6 +59084,7 @@ ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57116,6 +59113,7 @@ ; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57145,6 +59143,7 @@ ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57177,6 +59176,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57204,6 +59204,7 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax ; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57226,12 +59227,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1738: +; NoVLX-NEXT: .Lcfi2048: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1739: +; NoVLX-NEXT: .Lcfi2049: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1740: +; NoVLX-NEXT: .Lcfi2050: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57278,6 +59279,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57299,12 +59301,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1741: +; NoVLX-NEXT: .Lcfi2051: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1742: +; NoVLX-NEXT: .Lcfi2052: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1743: +; NoVLX-NEXT: .Lcfi2053: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57351,6 +59353,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57373,12 +59376,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1744: +; NoVLX-NEXT: .Lcfi2054: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1745: +; NoVLX-NEXT: .Lcfi2055: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1746: +; NoVLX-NEXT: .Lcfi2056: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57425,6 +59428,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57449,12 +59453,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1747: +; NoVLX-NEXT: .Lcfi2057: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1748: +; NoVLX-NEXT: .Lcfi2058: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1749: +; NoVLX-NEXT: .Lcfi2059: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57502,6 +59506,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57526,12 +59531,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1750: +; NoVLX-NEXT: .Lcfi2060: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1751: +; NoVLX-NEXT: .Lcfi2061: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1752: +; NoVLX-NEXT: .Lcfi2062: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57579,6 +59584,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57604,12 +59610,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1753: +; NoVLX-NEXT: .Lcfi2063: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1754: +; NoVLX-NEXT: .Lcfi2064: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1755: +; NoVLX-NEXT: .Lcfi2065: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57657,6 +59663,7 @@ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57686,6 +59693,7 @@ ; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57710,6 +59718,7 @@ ; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57732,53 +59741,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1756: +; NoVLX-NEXT: .Lcfi2066: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1757: +; NoVLX-NEXT: .Lcfi2067: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1758: +; NoVLX-NEXT: .Lcfi2068: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57789,6 +59798,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57810,53 +59820,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1759: +; NoVLX-NEXT: .Lcfi2069: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1760: +; NoVLX-NEXT: .Lcfi2070: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1761: +; NoVLX-NEXT: .Lcfi2071: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57867,6 +59877,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57889,53 +59900,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1762: +; NoVLX-NEXT: .Lcfi2072: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1763: +; NoVLX-NEXT: .Lcfi2073: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1764: +; NoVLX-NEXT: .Lcfi2074: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57946,6 +59957,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -57970,54 +59982,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1765: +; NoVLX-NEXT: .Lcfi2075: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1766: +; NoVLX-NEXT: .Lcfi2076: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1767: +; NoVLX-NEXT: .Lcfi2077: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -58028,6 +60040,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -58052,54 +60065,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1768: +; NoVLX-NEXT: .Lcfi2078: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1769: +; NoVLX-NEXT: .Lcfi2079: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1770: +; NoVLX-NEXT: .Lcfi2080: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -58110,6 +60123,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -58135,54 +60149,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1771: +; NoVLX-NEXT: .Lcfi2081: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1772: +; NoVLX-NEXT: .Lcfi2082: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1773: +; NoVLX-NEXT: .Lcfi2083: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -58193,6 +60207,7 @@ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -58223,6 +60238,7 @@ ; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> @@ -58248,6 +60264,7 @@ ; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> Index: test/CodeGen/X86/sse42-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse42-intrinsics-x86.ll +++ test/CodeGen/X86/sse42-intrinsics-x86.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) { ; SSE42-LABEL: test_x86_sse42_pcmpestri128: