Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2172,7 +2172,7 @@
 // GR from/to 8-bit mask without native support
 def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
           (COPY_TO_REGCLASS
-           (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>;
+           (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), VK8)>;
 def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
           (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
Index: test/CodeGen/X86/avx512-calling-conv.ll
===================================================================
--- test/CodeGen/X86/avx512-calling-conv.ll
+++ test/CodeGen/X86/avx512-calling-conv.ll
@@ -277,7 +277,6 @@
 ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    movb $85, %al
-; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1 {%k1}
 ; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
@@ -313,7 +312,6 @@
 ; KNL_X32-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL_X32-NEXT:    vpsllvq LCPI7_0, %zmm0, %zmm0
 ; KNL_X32-NEXT:    movb $85, %al
-; KNL_X32-NEXT:    movzbl %al, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    vptestmq %zmm0, %zmm0, %k1 {%k1}
 ; KNL_X32-NEXT:    vpbroadcastd LCPI7_1, %zmm0
Index: test/CodeGen/X86/avx512-ext.ll
===================================================================
--- test/CodeGen/X86/avx512-ext.ll
+++ test/CodeGen/X86/avx512-ext.ll
@@ -1312,8 +1312,7 @@
 define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 ; KNL-LABEL: zext_8i1_to_8xi64:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movzbl %dil, %eax
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; KNL-NEXT:    retq
 ;
Index: test/CodeGen/X86/avx512-fma-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,78 +1,104 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
 
 declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
 
 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmadd_ps_z
-  ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_x86_vfnmadd_ps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmadd_ps
-  ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_mask_vfnmadd_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmadd_pd_z
-  ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_x86_vfnmadd_pd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmadd_pd
-  ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_mask_vfnmadd_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmsubps_z
-  ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_x86_vfnmsubps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmsub_ps
-  ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_mask_vfnmsub_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmsubpd_z
-  ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_x86_vfnmsubpd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmsub_pd
-  ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_mask_vfnmsub_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmaddsubps_z
-  ; CHECK: vfmaddsub213ps %zmm
+; CHECK-LABEL: test_x86_vfmaddsubps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: test_mask_fmaddsub_ps:
-; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
   ret <16 x float> %res
 }
@@ -80,16 +106,21 @@
 declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmaddsubpd_z
-  ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_x86_vfmaddsubpd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfmaddsub_pd
-  ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_mask_vfmaddsub_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -97,8 +128,7 @@
 define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -115,8 +145,7 @@
 define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -133,8 +162,7 @@
 define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -200,8 +228,7 @@
 define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -231,71 +258,96 @@
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
-  ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
-  ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
-  ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
-  ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
-  ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
-  ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
-  ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
-  ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
-  ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
-  ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
@@ -305,8 +357,7 @@
 define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -336,71 +387,96 @@
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
-  ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
-  ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
-  ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
-  ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
-  ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
-  ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
-  ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
-  ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
-  ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
-  ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -408,8 +484,7 @@
 define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -426,8 +501,7 @@
 define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -444,8 +518,7 @@
 define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -508,71 +581,96 @@
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
-  ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
-  ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
-  ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
-  ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
-  ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
-  ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
-  ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
-  ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
-  ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
-  ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -580,8 +678,7 @@
 define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -598,8 +695,7 @@
 define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -646,8 +742,7 @@
 define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -516,15 +516,18 @@
 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK: kmovw %edi, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-
-  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
-  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
-  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+
+  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
   %res3 = fadd <16 x float> %res, %res1
   %res4 = fadd <16 x float> %res2, %res3
   ret <16 x float> %res4
@@ -534,15 +537,18 @@
 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-
-  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
-  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
-  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+
+  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
   %res3 = fadd <8 x double> %res, %res1
   %res4 = fadd <8 x double> %res2, %res3
   ret <8 x double> %res4
@@ -581,8 +587,7 @@
 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0
@@ -643,8 +648,7 @@
 define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_conflict_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -689,8 +693,7 @@
 define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_lzcnt_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -713,8 +716,7 @@
 define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_x86_mask_blend_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
@@ -724,8 +726,7 @@
 define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %b = load <8 x double>, <8 x double>* %ptr
@@ -748,8 +749,7 @@
 define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
 ; CHECK-LABEL: test_x86_mask_blend_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
@@ -825,8 +825,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpabsq %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -946,8 +945,7 @@
 define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_pd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm0
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z}
@@ -961,12 +959,11 @@
 }
 
 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
- 
+
 define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_pd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z}
@@ -993,8 +990,7 @@
 define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
 ; CHECK-LABEL: test_mask_valign_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1064,8 +1060,7 @@
 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
@@ -1111,8 +1106,7 @@
 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
@@ -1374,8 +1368,7 @@
 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %r8d
 ; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0 {%k1}
@@ -1488,8 +1481,7 @@
 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %r8d
 ; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
@@ -1635,8 +1627,7 @@
 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1647,8 +1638,7 @@
 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1701,8 +1691,7 @@
 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1713,8 +1702,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1767,8 +1755,7 @@
 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1779,8 +1766,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1833,8 +1819,7 @@
 define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psll_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1845,8 +1830,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1899,8 +1883,7 @@
 define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1911,8 +1894,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1965,8 +1947,7 @@
 define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psra_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1977,8 +1958,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2031,8 +2011,7 @@
 define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2043,8 +2022,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2098,8 +2076,7 @@
 define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2110,8 +2087,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2164,8 +2140,7 @@
 define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2176,8 +2151,7 @@
 define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2378,8 +2352,7 @@
 define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rn:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2390,8 +2363,7 @@
 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2402,8 +2374,7 @@
 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_ru:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2414,8 +2385,7 @@
 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2501,8 +2471,7 @@
 define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2524,8 +2493,7 @@
 define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2547,8 +2515,7 @@
 define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2779,8 +2746,7 @@
 define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2791,8 +2757,7 @@
 define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2812,8 +2777,7 @@
 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2825,8 +2789,7 @@
 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2849,8 +2812,7 @@
 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2864,8 +2826,7 @@
 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -2889,8 +2850,7 @@
 define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2901,8 +2861,7 @@
 define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2922,8 +2881,7 @@
 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2935,8 +2893,7 @@
 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2959,8 +2916,7 @@
 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2974,8 +2930,7 @@
 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -2999,8 +2954,7 @@
 define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -3011,8 +2965,7 @@
 define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3032,8 +2985,7 @@
 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3045,8 +2997,7 @@
 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3070,8 +3021,7 @@
 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3086,8 +3036,7 @@
 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -3112,8 +3061,7 @@
 define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -3124,8 +3072,7 @@
 define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3145,8 +3092,7 @@
 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3158,8 +3104,7 @@
 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3183,8 +3128,7 @@
 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3199,8 +3143,7 @@
 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -4314,8 +4257,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4347,8 +4289,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4380,8 +4321,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4411,8 +4351,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4446,8 +4385,7 @@
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
@@ -4481,8 +4419,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
@@ -4517,8 +4454,7 @@
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm2
 ; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
@@ -4556,8 +4492,7 @@
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
@@ -4590,8 +4525,7 @@
 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4622,8 +4556,7 @@
 define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4655,8 +4588,7 @@
 define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4688,8 +4620,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -4709,8 +4640,7 @@
 define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4778,8 +4708,7 @@
 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    retq
@@ -4861,8 +4790,7 @@
 define <8 x
i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovqw %zmm0, %xmm0 @@ -4882,8 +4810,7 @@ define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovqw %zmm0, (%rdi) ; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; CHECK-NEXT: retq @@ -4897,8 +4824,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 @@ -4932,8 +4858,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 @@ -4967,8 +4892,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovqd %zmm0, %ymm0 @@ -4988,8 +4912,7 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovqd %zmm0, (%rdi) ; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; CHECK-NEXT: retq @@ -5003,8 +4926,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 @@ -5038,8 +4960,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 @@ -5277,8 +5198,7 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 ; 
CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 @@ -5310,8 +5230,7 @@ define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 @@ -5327,8 +5246,7 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -5344,8 +5262,7 @@ define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 @@ -5377,8 +5294,7 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1} ; CHECK-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 @@ -5410,8 +5326,7 @@ define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 @@ -5427,8 +5342,7 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 @@ -5461,8 +5375,7 @@ define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 @@ -5703,8 +5616,7 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] @@ -5741,8 +5653,7 @@ define <8 x 
i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 @@ -5758,8 +5669,7 @@ define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 @@ -5842,8 +5752,7 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6] ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6] ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] @@ -5880,8 +5789,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6] ; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6] ; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6] @@ -5921,8 +5829,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z} ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 @@ -6002,8 +5909,7 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -6023,8 +5929,7 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -6112,8 +6017,7 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { ; CHECK-LABEL: 
test_int_x86_avx512_mask_pternlog_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm3 ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 @@ -6130,8 +6034,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm3 ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 @@ -6188,8 +6091,7 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] @@ -6377,12 +6279,14 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512: -; CHECK: kmovw %edi, %k1 -; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z} -; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm1 {%k1} -; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm0 -; CHECK: vaddps %zmm1, %zmm0, %zmm0 -; CHECK: vaddps %zmm0, %zmm2, %zmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) @@ -6396,12 +6300,14 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: -; CHECK: kmovw %eax, %k1 -; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z} -; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm1 {%k1} -; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm0 -; CHECK: vaddpd %zmm1, %zmm0, %zmm0 -; CHECK: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) @@ -6415,12 +6321,14 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512: -; CHECK: kmovw %edi, %k1 -; CHECK: vshufi32x4 $0, %zmm0, 
%zmm0, %zmm2 {%k1} {z} -; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm1 {%k1} -; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm0 -; CHECK: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) @@ -6434,12 +6342,14 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: -; CHECK: kmovw %eax, %k1 -; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z} -; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm1 {%k1} -; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm0 -; CHECK: vpaddq %zmm1, %zmm0, %zmm0 -; CHECK: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) @@ -6454,8 +6364,7 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm2 {%k1} {z} ; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm0 @@ -6515,8 +6424,7 @@ define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2 {%k1} {z} ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0 @@ -6556,8 +6464,7 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2 {%k1} {z} ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0 @@ -6577,13 +6484,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i16 %x1, <16 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpshufd $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, 
%k1 +; CHECK-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpshufd $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> zeroinitializer, i8 %x3) %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 -1) @@ -6597,13 +6504,13 @@ define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) @@ -6617,14 +6524,13 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -6638,13 +6544,13 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; 
CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3) %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1) @@ -6658,14 +6564,13 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3) %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1) @@ -6697,8 +6602,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_q: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 {%k1} {z} @@ -6716,13 +6620,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovzxbd %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovzxbd %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) @@ -6736,14 +6640,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovzxbq %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovzxbq %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq 
%zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) @@ -6757,14 +6660,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovzxdq %ymm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovzxdq %ymm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) @@ -6778,13 +6680,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovzxwd %ymm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovzxwd %ymm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) @@ -6798,14 +6700,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovzxwq %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovzxwq %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) @@ -6819,13 +6720,13 @@ define <16 x 
i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) @@ -6839,14 +6740,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) @@ -6860,14 +6760,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) @@ -6882,13 +6781,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) @@ -6903,14 +6802,13 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) @@ -6924,14 +6822,13 @@ define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i8 %x1, <8 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpermpd $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> %x2, i8 %x3) %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> zeroinitializer, i8 %x3) %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> %x2, i8 -1) @@ -6945,14 +6842,13 @@ define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpermq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpermq $3, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpermq $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpermq $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call 
<8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3) %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1) @@ -6962,18 +6858,17 @@ } declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) - + define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) @@ -6987,14 +6882,13 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -7010,13 +6904,13 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x float> 
@llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) @@ -7030,13 +6924,13 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i8 %x3) %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 -1) Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -18,8 +18,7 @@ define i8 @mask8(i8 %x) { ; KNL-LABEL: mask8: ; KNL: ## BB#0: -; KNL-NEXT: movzbl %dil, %eax -; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -232,7 +231,6 @@ ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: movb $85, %al -; KNL-NEXT: movzbl %al, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1337,8 +1335,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-LABEL: test18: ; KNL: ## BB#0: -; KNL-NEXT: movzbl %dil, %eax -; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftlw $7, %k1, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 Index: test/CodeGen/X86/avx512-select.ll =================================================================== --- test/CodeGen/X86/avx512-select.ll +++ test/CodeGen/X86/avx512-select.ll @@ -1,62 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; CHECK-LABEL: select00 -; CHECK: vmovaps -; CHECK-NEXT: LBB define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind { +; CHECK-LABEL: select00: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: cmpl $255, %edi +; CHECK-NEXT: je LBB0_2 +; CHECK-NEXT: ## BB#1: +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: LBB0_2: +; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b %res = xor <16 x i32> %b, %selres ret <16 x i32> %res } -; CHECK-LABEL: select01 -; CHECK: vmovaps -; CHECK-NEXT: LBB define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind { +; CHECK-LABEL: select01: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord 
%zmm1, %zmm1, %zmm1 +; CHECK-NEXT: cmpl $255, %edi +; CHECK-NEXT: je LBB1_2 +; CHECK-NEXT: ## BB#1: +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: LBB1_2: +; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b %res = xor <8 x i64> %b, %selres ret <8 x i64> %res } -; CHECK-LABEL: @select02 -; CHECK: cmpless %xmm0, %xmm3, %k1 -; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1} -; CHECK: ret define float @select02(float %a, float %b, float %c, float %eps) { +; CHECK-LABEL: select02: +; CHECK: ## BB#0: +; CHECK-NEXT: vcmpless %xmm0, %xmm3, %k1 +; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %cmp = fcmp oge float %a, %eps %cond = select i1 %cmp, float %c, float %b ret float %cond } -; CHECK-LABEL: @select03 -; CHECK: cmplesd %xmm0, %xmm3, %k1 -; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1} -; CHECK: ret define double @select03(double %a, double %b, double %c, double %eps) { +; CHECK-LABEL: select03: +; CHECK: ## BB#0: +; CHECK-NEXT: vcmplesd %xmm0, %xmm3, %k1 +; CHECK-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %cmp = fcmp oge double %a, %eps %cond = select i1 %cmp, double %c, double %b ret double %cond } -; CHECK-LABEL: @select04 -; CHECK: vmovaps %zmm3, %zmm1 -; CHECK-NEXT: ret -; PR20677 define <16 x double> @select04(<16 x double> %a, <16 x double> %b) { +; CHECK-LABEL: select04: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovaps %zmm3, %zmm1 +; CHECK-NEXT: retq %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b ret <16 x double> %sel } -; CHECK-LABEL: select05 -; CHECK: movzbl %sil, %eax -; CHECK: kmovw %eax, %k0 -; CHECK: movzbl %dil, %eax -; CHECK: kmovw %eax, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax define i8 @select05(i8 %a.0, i8 %m) { +; CHECK-LABEL: select05: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq %mask = bitcast i8 %m to <8 x i1> %a = bitcast i8 %a.0 to <8 x i1> %r = select <8 x i1> %mask, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i1> %a @@ -64,14 +83,14 @@ ret i8 %res; } -; CHECK-LABEL: select06 -; CHECK: movzbl %sil, %eax -; CHECK: kmovw %eax, %k0 -; CHECK: movzbl %dil, %eax -; CHECK: kmovw %eax, %k1 -; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax define i8 @select06(i8 %a.0, i8 %m) { +; CHECK-LABEL: select06: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kandw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq %mask = bitcast i8 %m to <8 x i1> %a = bitcast i8 %a.0 to <8 x i1> %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer @@ -79,19 +98,18 @@ ret i8 %res; } -; CHECK-LABEL: select07 -; CHECK-DAG: movzbl %dl, %eax -; CHECK-DAG: kmovw %eax, %k0 -; CHECK-DAG: movzbl %dil, %eax -; CHECK-DAG: kmovw %eax, %k1 -; CHECK-DAG: movzbl %sil, %eax -; CHECK-DAG: kmovw %eax, %k2 -; CHECK: kandw %k0, %k1, %k1 -; CHECK-NEXT: knotw %k0, %k0 -; CHECK-NEXT: kandw %k0, %k2, %k0 -; CHECK-NEXT: korw %k0, %k1, %k0 -; CHECK-NEXT: kmovw %k0, %eax define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) { +; CHECK-LABEL: select07: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovw %esi, %k2 +; CHECK-NEXT: kandw %k0, %k1, %k1 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: kandw %k0, %k2, %k0 +; CHECK-NEXT: korw %k0, %k1, %k0 +; CHECK-NEXT: 
kmovw %k0, %eax
+; CHECK-NEXT: retq
 %mask = bitcast i8 %m to <8 x i1>
 %a = bitcast i8 %a.0 to <8 x i1>
 %b = bitcast i8 %b.0 to <8 x i1>
Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -651,8 +651,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -669,8 +668,7 @@
 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -687,8 +685,7 @@
 define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -703,8 +700,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -721,8 +717,7 @@
 define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -739,8 +734,7 @@
 define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -755,8 +749,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -773,8 +766,7 @@
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -791,8 +783,7 @@
 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -807,8 +798,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -825,8 +815,7 @@
 define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -843,8 +832,7 @@
 define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -862,8 +850,7 @@
 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
@@ -881,8 +868,7 @@
 define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
@@ -899,8 +885,7 @@
 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
@@ -917,8 +902,7 @@
 define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
@@ -1006,8 +990,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
@@ -1024,8 +1007,7 @@
 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
@@ -1040,8 +1022,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
@@ -1058,8 +1039,7 @@
 define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
@@ -1074,8 +1054,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
@@ -1092,8 +1071,7 @@
 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
@@ -1108,8 +1086,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
@@ -1126,8 +1103,7 @@
 define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
@@ -1142,8 +1118,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
@@ -1158,8 +1133,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
@@ -1174,8 +1148,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
@@ -1190,8 +1163,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
@@ -1242,8 +1214,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1260,8 +1231,7 @@
 define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1278,8 +1248,7 @@
 define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1294,8 +1263,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1312,8 +1280,7 @@
 define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1330,8 +1297,7 @@
 define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1346,8 +1312,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1364,8 +1329,7 @@
 define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1382,8 +1346,7 @@
 define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1398,8 +1361,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1416,8 +1378,7 @@
 define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1434,8 +1395,7 @@
 define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1452,8 +1412,7 @@
 define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
@@ -1470,8 +1429,7 @@
 define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
@@ -1488,8 +1446,7 @@
 define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
@@ -1506,8 +1463,7 @@
 define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
 ; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
@@ -4037,8 +3993,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -4054,8 +4009,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -4071,8 +4025,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
@@ -4256,8 +4209,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0
@@ -4357,8 +4309,7 @@
 define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -4490,8 +4441,7 @@
 define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: kmovw %edi, %k0
 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
@@ -4515,8 +4465,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -4556,8 +4505,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -4617,8 +4565,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
@@ -4638,8 +4585,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
@@ -4659,8 +4605,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0
@@ -4720,8 +4665,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i16 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0
@@ -4742,8 +4686,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i16 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0
@@ -4764,8 +4707,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0
@@ -4807,8 +4749,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0
@@ -4870,8 +4811,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0
@@ -4892,8 +4832,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
@@ -4933,8 +4872,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0
@@ -4994,8 +4932,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
@@ -5015,8 +4952,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0
@@ -5057,8 +4993,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
@@ -5098,8 +5033,7 @@
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -5119,8 +5053,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -5140,8 +5073,7 @@
 define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpermw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpermw %xmm1, %xmm0, %xmm0
Index: test/CodeGen/X86/avx512cdvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s

 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
@@ -7,8 +8,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm0
@@ -28,8 +28,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vplzcntd %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -45,8 +44,7 @@
 define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vplzcntq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -62,8 +60,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vplzcntq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -79,8 +76,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpconflictd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpconflictd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpconflictd %xmm0, %xmm0
@@ -100,8 +96,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpconflictd %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpconflictd %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -117,8 +112,7 @@
 define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpconflictq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpconflictq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -134,8 +128,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpconflictq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpconflictq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -147,33 +140,45 @@
 }

 define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-  ; CHECK: test_x86_vbroadcastmw_256
-  ; CHECK: vpbroadcastmw2d %k0, %ymm0
-  %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)

 define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-  ; CHECK: test_x86_vbroadcastmw_128
-  ; CHECK: vpbroadcastmw2d %k0, %xmm0
-  %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)

 define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-  ; CHECK: test_x86_broadcastmb_256
-  ; CHECK: vpbroadcastmb2q %k0, %ymm0
-  %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)

 define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-  ; CHECK: test_x86_broadcastmb_128
-  ; CHECK: vpbroadcastmb2q %k0, %xmm0
-  %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4016,8 +4016,7 @@
 define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
@@ -4033,8 +4032,7 @@
 define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
@@ -4050,8 +4048,7 @@
 define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -4067,8 +4064,7 @@
 define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -4084,8 +4080,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4101,8 +4096,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4118,8 +4112,7 @@
 define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -4135,8 +4128,7 @@
 define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -4152,8 +4144,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4169,8 +4160,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4186,8 +4176,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4203,8 +4192,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -4220,8 +4208,7 @@
 define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
@@ -4237,8 +4224,7 @@
 define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
@@ -4254,8 +4240,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4271,8 +4256,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -4288,8 +4272,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4305,8 +4288,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4322,8 +4304,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4339,8 +4320,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4356,8 +4336,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4373,8 +4352,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -4390,8 +4368,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
@@ -4407,8 +4384,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -4424,8 +4400,7 @@
 define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
@@ -4441,8 +4416,7 @@
 define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
@@ -4458,8 +4432,7 @@
 define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -4475,8 +4448,7 @@
 define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -4545,8 +4517,7 @@
 define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
@@ -4569,8 +4540,7 @@
 define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
@@ -4593,8 +4563,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0
@@ -4612,8 +4581,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0
@@ -4651,8 +4619,7 @@
 define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0
@@ -4672,8 +4639,7 @@
 define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0
 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
@@ -4689,8 +4655,7 @@
 define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -4706,8 +4671,7 @@
 define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -4723,8 +4687,7 @@
 define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: ## xmm2 = xmm2[0],k1[1]
 ; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
@@ -4747,8 +4710,7 @@
 define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2]
 ; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0
@@ -4766,8 +4728,7 @@
 define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: ## xmm2 = xmm2[2,1],k1[1,0]
 ; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0
@@ -4785,8 +4746,7 @@
 define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4]
 ; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0
@@ -4804,8 +4764,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm0
@@ -4825,8 +4784,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -4842,8 +4800,7 @@
 define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -4859,8 +4816,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
@@ -4876,8 +4832,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: ## ymm1 = ymm1[0,1,3,2]
 ; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
@@ -4900,8 +4855,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: ## xmm1 = xmm1[1,0]
 ; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
@@ -4924,8 +4878,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: ## ymm1 = ymm1[2,1,1,0,6,5,5,4]
 ; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 {%k1} {z}
@@ -4948,8 +4901,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: ## xmm1 = xmm1[2,1,1,0]
 ; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 {%k1} {z}
@@ -4972,8 +4924,7 @@
 define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
@@ -4993,8 +4944,7 @@
 define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
@@ -5014,8 +4964,7 @@
 define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
@@ -5035,8 +4984,7 @@
 define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
@@ -5056,8 +5004,7 @@
 define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
@@ -5077,8 +5024,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
@@ -5099,8 +5045,7 @@
 define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
@@ -5117,8 +5062,7 @@
 define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
@@ -5135,8 +5079,7 @@
 define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
@@ -5153,8 +5096,7 @@
 define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
@@ -5171,8 +5113,7 @@
 define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
@@ -5189,8 +5130,7 @@
 define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
@@ -5207,8 +5147,7 @@
 define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
@@ -5225,8 +5164,7 @@
 define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
@@ -5243,8 +5181,7 @@
 define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpbroadcastd %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
@@ -5264,8 +5201,7 @@
 define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -5285,8 +5221,7 @@
 define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
@@ -5306,8 +5241,7 @@
 define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -5324,7 +5258,7 @@

 define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
   ; CHECK: test_x86_vcvtph2ps_128
-  ; CHECK: vcvtph2ps %xmm0, %xmm0
+  ; CHECK: vcvtph2ps %xmm0, %xmm0
   %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
   ret <4 x float> %res
 }
@@ -5355,7 +5289,7 @@

 define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) {
   ; CHECK: test_x86_vcvtph2ps_256_rrk
-  ; CHECK: vcvtph2ps %xmm0, %ymm1 {%k1}
+  ; CHECK: vcvtph2ps %xmm0, %ymm1 {%k1}
   %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask)
   ret <8 x float> %res
 }
@@ -5393,17 +5327,16 @@
 define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm0
+; CHECK-NEXT: vmovsldup %xmm0, %xmm0
 ; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
 %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
@@ -5417,17 +5350,16 @@
 define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm0
+; CHECK-NEXT: vmovsldup %ymm0, %ymm0
 ; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
 %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
 %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
@@ -5441,17 +5373,16 @@
 define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm0
+; CHECK-NEXT: vmovshdup %xmm0, %xmm0
 ; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
 %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
@@ -5465,17 +5396,16 @@
 define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm0
+; CHECK-NEXT: vmovshdup %ymm0, %ymm0
 ; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
 %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
 %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
@@ -5488,8 +5418,7 @@
 define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: ## xmm1 = xmm0[0,0]
 ; CHECK-NEXT: vmovddup %xmm0, %xmm2 {%k1} {z}
@@ -5512,8 +5441,7 @@
 define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2]
 ; CHECK-NEXT: vmovddup %ymm0, %ymm2 {%k1} {z}
@@ -5714,15 +5642,15 @@
 define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-
- %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+; CHECK: kmovw %edi, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+
+ %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
 %res3 = fadd <4 x double> %res, %res1
 %res4 = fadd <4 x double> %res2, %res3
 ret <4 x double> %res4
@@ -5731,15 +5659,15 @@
 define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
-; CHECK: kmovw %eax, %k1
+; CHECK: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0

- %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+ %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
 %res3 = fadd <8 x float> %res, %res1
 %res4 = fadd <8 x float> %res2, %res3
 ret <8 x float> %res4
@@ -5748,15 +5676,15 @@
 define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
-; CHECK: kmovw %eax, %k1
+; CHECK: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0

- %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>
%a0, <4 x float> zeroinitializer, i8 %mask) + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) %res3 = fadd <4 x float> %res, %res1 %res4 = fadd <4 x float> %res2, %res3 ret <4 x float> %res4 @@ -5768,7 +5696,7 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: -; CHECK: kmovw %eax, %k1 +; CHECK: kmovw %edi, %k1 ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 @@ -5787,7 +5715,7 @@ define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: -; CHECK: kmovw %eax, %k1 +; CHECK: kmovw %edi, %k1 ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 @@ -5807,8 +5735,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 @@ -5828,8 +5755,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -5849,8 +5775,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0 @@ -5870,8 +5795,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0 @@ -5889,8 +5813,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 @@ -5910,8 +5833,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 
%x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 @@ -5931,8 +5853,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0 @@ -5952,8 +5873,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0 @@ -5993,8 +5913,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 @@ -6014,8 +5933,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -6035,8 +5953,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 @@ -6056,8 +5973,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -6077,8 +5993,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0 @@ -6098,8 +6013,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: 
movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0 @@ -6119,8 +6033,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0 @@ -6140,8 +6053,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0 @@ -6161,8 +6073,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 @@ -6182,8 +6093,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 @@ -6203,8 +6113,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0 @@ -6224,8 +6133,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0 @@ -6246,8 +6154,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0 @@ -6267,8 +6174,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1} ; 
CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0 @@ -6288,8 +6194,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 @@ -6309,8 +6214,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpslld $3, %ymm0, %ymm0 @@ -6330,8 +6234,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0 @@ -6351,8 +6254,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0 @@ -6372,8 +6274,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1} ; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0 @@ -6391,8 +6292,7 @@ define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} @@ -6410,8 +6310,7 @@ define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} @@ -6429,8 +6328,7 @@ define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovapd (%rdi), %ymm0 ; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z} @@ -6448,8 +6346,7 @@ define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: 
test_mask_load_unaligned_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovupd (%rdi), %ymm0 ; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z} @@ -6467,8 +6364,7 @@ define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} @@ -6486,8 +6382,7 @@ define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} @@ -6505,8 +6400,7 @@ define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovapd (%rdi), %xmm0 ; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z} @@ -6524,8 +6418,7 @@ define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovupd (%rdi), %xmm0 ; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z} @@ -6545,8 +6438,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0 @@ -6566,8 +6458,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 @@ -6587,8 +6478,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 @@ -6608,8 +6498,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsravq %ymm1, 
%ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 @@ -6629,8 +6518,7 @@ define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -6650,8 +6538,7 @@ define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 @@ -6671,8 +6558,7 @@ define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 @@ -6692,8 +6578,7 @@ define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z} ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 @@ -6713,14 +6598,13 @@ define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -6734,14 +6618,13 @@ define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} -; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: 
vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -6755,14 +6638,13 @@ define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -6776,14 +6658,13 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} -; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -6796,14 +6677,13 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vprold $3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; 
CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1) @@ -6817,14 +6697,13 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vprold $3, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1) @@ -6838,14 +6717,13 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1) @@ -6859,14 +6737,13 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1) @@ 
-6880,8 +6757,7 @@ define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 ; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovdqa32 (%rdi), %xmm1 {%k1} {z} @@ -6899,8 +6775,7 @@ define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm1 {%k1} {z} @@ -6918,8 +6793,7 @@ define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 ; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ; CHECK-NEXT: vmovdqa64 (%rdi), %xmm1 {%k1} {z} @@ -6937,8 +6811,7 @@ define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm1 {%k1} {z} @@ -6956,11 +6829,10 @@ define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -6977,14 +6849,13 @@ define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} -; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -6998,14 +6869,13 @@ define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; 
CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -7019,14 +6889,13 @@ define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} -; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -7040,14 +6909,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vprord $3, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vprord $3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1) @@ -7061,14 +6929,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vprord $3, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; 
CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vprord $3, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1) @@ -7082,14 +6949,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1) @@ -7103,14 +6969,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1) @@ -7124,14 +6989,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call 
<4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) @@ -7145,14 +7009,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) @@ -7166,14 +7029,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) @@ -7187,14 +7049,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) @@ -7208,14 +7069,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, 
<2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) @@ -7229,14 +7089,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) @@ -7250,14 +7109,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) @@ -7271,14 +7129,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; 
CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) @@ -7292,14 +7149,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) @@ -7313,14 +7169,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) @@ -7334,14 +7189,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) %res1 
= call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
 %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7355,14 +7209,13 @@
 define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7376,14 +7229,13 @@
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
 %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
 %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
 %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7397,14 +7249,13 @@
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
 %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7418,14 +7269,13 @@
 define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
 %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
 %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
 %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7439,14 +7289,13 @@
 define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7460,14 +7309,13 @@
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
 %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
 %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
 %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7481,14 +7329,13 @@
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
 %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7502,15 +7349,14 @@
 define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i8 %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0
 ; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> %x2, i8 %x3)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> zeroinitializer, i8 %x3)
 %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> %x2, i8 -1)
@@ -7524,15 +7370,14 @@
 define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpermq $3, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm0
 ; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0]
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
 %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
@@ -7545,14 +7390,13 @@
 define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
 %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -7566,14 +7410,13 @@
 define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
 %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -7589,14 +7432,13 @@
 define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
 %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
 %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -7610,14 +7452,13 @@
 define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
 %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -279,8 +279,7 @@
 ;
 ; KNL_64-LABEL: test7:
 ; KNL_64: # BB#0:
-; KNL_64-NEXT: movzbl %sil, %eax
-; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kmovw %esi, %k1
 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT: kmovw %k1, %k2
 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
@@ -1128,7 +1127,6 @@
 ; KNL_64-LABEL: test24:
 ; KNL_64: # BB#0:
 ; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1215,7 +1213,6 @@
 ; KNL_64-LABEL: test26:
 ; KNL_64: # BB#0:
 ; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1260,7 +1257,6 @@
 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
 ; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
 ; KNL_64-NEXT: retq
@@ -1271,7 +1267,6 @@
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
 ; KNL_32-NEXT: movb $3, %cl
-; KNL_32-NEXT: movzbl %cl, %ecx
 ; KNL_32-NEXT: kmovw %ecx, %k1
 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
 ; KNL_32-NEXT: retl
@@ -1297,7 +1292,6 @@
 ; KNL_64: # BB#0:
 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
 ; KNL_64-NEXT: retq
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -162,8 +162,7 @@
 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
@@ -192,8 +191,7 @@
 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
@@ -223,8 +221,7 @@
 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
@@ -250,8 +247,7 @@
 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
@@ -281,8 +277,7 @@
 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
 ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
@@ -312,10 +307,8 @@
 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: movb $51, %al
-; AVX512F-NEXT: movzbl %al, %eax
 ; AVX512F-NEXT: kmovw %eax, %k2
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z}