Index: test/CodeGen/X86/avx512-schedule-fma-intrinsics.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512-schedule-fma-intrinsics.ll
@@ -0,0 +1,770 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK
+
+declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_vfnmadd_ps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_x86_vfnmadd_pd_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd_pd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_vfnmsubps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_x86_vfnmsubpd_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub_pd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_vfmaddsubps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmaddsub132ps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_x86_vfmaddsubpd_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmaddsub_pd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmaddsub132pd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
+; CHECK-NEXT:    vfmaddsub132pd %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
+; CHECK-NEXT:    vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm1, %zmm3
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfmaddsub132ps %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm3
+; CHECK-NEXT:    vfmaddsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm3
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
+; CHECK-NEXT:    vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm3
+; CHECK-NEXT:    vfmsubadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132ps {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132ps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
+; CHECK-NEXT:    vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm3
+; CHECK-NEXT:    vfmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132pd {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132pd {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132pd {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
+; CHECK-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
+; CHECK-NEXT:    vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm1, %zmm3
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfmadd132ps %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm3
+; CHECK-NEXT:    vfmadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm3
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
+; CHECK-NEXT:    vfnmsub132pd %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm2, %zmm3
+; CHECK-NEXT:    vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfnmsub132ps %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm3
+; CHECK-NEXT:    vfnmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm3
+; CHECK-NEXT:    vfnmadd132pd %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfnmadd132ps %zmm1, %zmm2, %zmm3 {%k1}
+; CHECK-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
Index: test/CodeGen/X86/avx512-schedule-gather-scatter-intrin.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512-schedule-gather-scatter-intrin.ll
@@ -0,0 +1,875 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+
+define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+;;
+;; Integer Gather/Scatter
+;;
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+
+define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qq:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dq:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+; CHECK-LABEL: gather_mask_dpd_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
+  store <8 x double> %x, <8 x double>* %stbuf
+  ret void
+}
+
+define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+; CHECK-LABEL: gather_mask_qpd_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  store <8 x double> %x, <8 x double>* %stbuf
+  ret void
+}
+
+define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
+; CHECK-LABEL: gather_mask_dps_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
+  ret <16 x float> %res;
+}
+
+define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
+; CHECK-LABEL: gather_mask_qps_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
+  ret <8 x float> %res;
+}
+
+define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_dpd_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load <8 x double>, <8 x double>* %src, align 64
+  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+  ret void
+}
+
+define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_qpd_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load <8 x double>, <8 x double>* %src, align 64
+  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+  ret void
+}
+
+define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_dps_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load <16 x float>, <16 x float>* %src, align 64
+  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+  ret void
+}
+
+define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_qps_execdomain:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load <8 x float>, <8 x float>* %src, align 32
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+  ret void
+}
+
+define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_qps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    kxnorw %k0, %k0, %k2
+; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+define void @prefetch(<8 x i64> %ind, i8* %base) {
+; CHECK-LABEL: prefetch:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
+; CHECK-NEXT:    kxorw %k0, %k0, %k1
+; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT:    movb $120, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
+  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
+  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
+  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
+  ret void
+}
+
+declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
+
+define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
+; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
+  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
+  %res2 = add <2 x i64> %res, %res1
+  ret <2 x i64> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
+
+define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
+
+define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
+; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
+  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
+  %res2 = add <4 x i64> %res, %res1
+  ret <4 x i64> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxnorw %k0, %k0, %k2
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
+; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
+  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm2 # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
+; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
+
+define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
+; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+  %res2 = add <2 x i64> %res, %res1
+  ret <2 x i64> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
+
+define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
+
+define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+  %res2 = add <4 x i64> %res, %res1
+  ret <4 x i64> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxnorw %k0, %k0, %k2
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
+; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
+  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
+
+define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL:
test_int_x86_avx512_gather3siv8_si: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa %ymm0, %ymm2 # sched: [1:0.25] +; CHECK-NEXT: kmovq %k1, %k2 +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2} +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32) + +define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2} +; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32) + +define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32) + +define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32) + +define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x 
float>, i32) + +define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2} +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32) + +define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32) + +define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2} +; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32) + +define 
void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2} +; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32) + +define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32) + +define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2} +; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32) + +define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32) + +define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 
x i32> %x2, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32) + +define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + +define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: scatter_mask_test: +; CHECK: # BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: movb $1, %al # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: movb $96, %al # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + +define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) { +; CHECK-LABEL: gather_mask_test: +; CHECK: # BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: movw $1, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: movw $220, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, 
i8* %base, <16 x i32>%ind, i16 1, i32 4) + %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4) + + %res4 = fadd <16 x float> %res, %res1 + %res5 = fadd <16 x float> %res3, %res2 + %res6 = fadd <16 x float> %res5, %res4 + ret <16 x float> %res6 +} Index: test/CodeGen/X86/avx512-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512-schedule-intrinsics.ll @@ -0,0 +1,5250 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone +define i32 @test_kortestz(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kortestz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: xorl %eax, %eax # sched: [1:0.25] +; CHECK-NEXT: kortestw %k0, %k1 +; CHECK-NEXT: sete %al # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone +define i32 @test_kortestc(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kortestc: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: xorl %eax, %eax # sched: [1:0.25] +; CHECK-NEXT: kortestw %k0, %k1 +; CHECK-NEXT: setb %al # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1) + ret i32 %res +} + +declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone +define i16 @test_kand(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kand: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: movw $8, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k2 +; CHECK-NEXT: kandw %k0, %k1, %k0 +; CHECK-NEXT: kandw %k0, %k2, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone +define i16 @test_kandn(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kandn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: movw $8, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k2 +; CHECK-NEXT: kandnw %k2, %k1, %k1 +; CHECK-NEXT: kandnw %k0, %k1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone +define i16 @test_knot(i16 %a0) { +; CHECK-LABEL: test_knot: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone +define i16 @test_kor(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kor: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: movw $8, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k2 +; CHECK-NEXT: 
korw %k0, %k1, %k0 +; CHECK-NEXT: korw %k0, %k2, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone +define i16 @test_kxnor(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kxnor: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: movw $8, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k2 +; CHECK-NEXT: kxorw %k0, %k1, %k0 +; CHECK-NEXT: kxorw %k0, %k2, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone +define i16 @test_kxor(i16 %a0, i16 %a1) { +; CHECK-LABEL: test_kxor: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: movw $8, %ax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k2 +; CHECK-NEXT: kxorw %k0, %k1, %k0 +; CHECK-NEXT: kxorw %k0, %k2, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +define <16 x float> @test_rcp_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_rcp_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vrcp14ps %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_rcp_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_rcp_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vrcp14pd %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone + +declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double> @test7(<8 x double> %a) { +; CHECK-LABEL: test7: +; CHECK: # BB#0: +; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) + ret <8 x double>%res +} + +declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float> @test8(<16 x float> %a) { +; CHECK-LABEL: test8: +; CHECK: # BB#0: +; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: 
[2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) + ret <16 x float>%res +} + +define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_rsqrt_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vrsqrt14ps %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_sqrt_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_sqrt_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vsqrtps %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_sqrt_round_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_getexp_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_getexp_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vgetexppd %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_getexp_round_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vgetexppd {sae}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_getexp_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_getexp_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vgetexpps %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_getexp_round_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vgetexpps {sae}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +declare <4 x float> 
@llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 # sched: [4:0.50] +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2) + %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3) + + %res.1 = fadd <4 x float> %res0, %res1 + %res.2 = fadd <4 x float> %res2, %res3 + %res = fadd <4 x float> %res.1, %res.2 + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 # sched: [4:0.50] +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) + %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2) + %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3) + + %res.1 = fadd <2 x double> %res0, %res1 + %res.2 = fadd <2 x double> %res2, %res3 + %res = fadd <2 x double> %res.1, %res.2 + ret <2 x double> %res +} + +define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) { +; CHECK-LABEL: test_x86_sse2_cvtsd2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2si %xmm0, %rax # sched: [6:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; [#uses=1] + ret i64 %res +} +declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone + +define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) { +; CHECK-LABEL: test_x86_sse2_cvtsi642sd: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = 
call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone + +define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttsd2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] +; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ; + %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ; + %res2 = add i64 %res0, %res1 + ret i64 %res2 +} +declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttsd2usi: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx +; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttsd2si: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone + + + +define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttsd2usi64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttsd2usi %xmm0, %rcx +; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ; + %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ; + %res2 = add i64 %res0, %res1 + ret i64 %res2 +} +declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone + +define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) { +; CHECK-LABEL: test_x86_sse_cvtss2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2si %xmm0, %rax # sched: [6:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; [#uses=1] + ret i64 %res +} +declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone + + +define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) { +; CHECK-LABEL: test_x86_sse_cvtsi642ss: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone + + +define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttss2si: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %ecx +; CHECK-NEXT: vcvttss2si 
%xmm0, %eax # sched: [3:1.00] +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttss2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] +; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ; + %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ; + %res2 = add i64 %res0, %res1 + ret i64 %res2 +} +declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttss2usi: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %ecx +; CHECK-NEXT: vcvttss2usi %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvttss2usi64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvttss2usi %xmm0, %rcx +; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ; + %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ; + %res2 = add i64 %res0, %res1 + ret i64 %res2 +} +declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx # sched: [1:0.25] +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) + %res1 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 3) + %res2 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 1) + %res3 = add i64 %res, %res1 + %res4 = add i64 %res3, %res2 + ret i64 %res4 +} +declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtsd2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2si %xmm0, %rax # sched: [6:1.00] +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx # sched: [1:0.25] +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) + %res1 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 3) + %res2 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 1) + %res3 = add i64 %res, 
%res1 + %res4 = add i64 %res3, %res2 + ret i64 %res4 +} +declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtss2usi64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx # sched: [1:0.25] +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) + %res1 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 3) + %res2 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 1) + %res3 = add i64 %res, %res1 + %res4 = add i64 %res3, %res2 + ret i64 %res4 +} +declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone + +define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtss2si64: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2si %xmm0, %rax # sched: [6:1.00] +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx # sched: [1:0.25] +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) + %res1 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 3) + %res2 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 1) + %res3 = add i64 %res, %res1 + %res4 = add i64 %res3, %res2 + ret i64 %res4 +} +declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx # sched: [1:0.25] +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 3) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 1) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtsd2si32: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2si %xmm0, %eax # sched: [6:1.00] +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx # sched: [1:0.25] +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 3) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 1) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtss2usi32: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx # sched: [1:0.25] +; CHECK-NEXT: vcvtss2usi {rd-sae}, 
%xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 3) + %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 1) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { +; CHECK-LABEL: test_x86_avx512_cvtss2si32: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2si %xmm0, %eax # sched: [6:1.00] +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx # sched: [1:0.25] +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + + %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 3) + %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 1) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone + +define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly + +define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) { +; CHECK-LABEL: test_x86_vcvtps2ph_256: +; CHECK: # BB#0: 
+; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 # sched: [1:0.50] +; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) + %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask) + store <16 x i16> %res1, <16 x i16> * %dst + %res = add <16 x i16> %res2, %res3 + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly + +define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) { +; CHECK-LABEL: test_x86_vbroadcast_ss_512: +; CHECK: # BB#0: +; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly + +define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) { +; CHECK-LABEL: test_x86_vbroadcast_sd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly + + define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: test_cmpps: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) + ret i16 %res + } + declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) + + define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: test_cmppd: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) + ret i8 %res + } + declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) + + ; fp min - max +define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { +; CHECK-LABEL: test_vmaxpd: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double>zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) + +define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) { +; CHECK-LABEL: test_vminpd: +; CHECK: # BB#0: +; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, 
+ <8 x double>zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) + +define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) { +; CHECK-LABEL: test_vptestmq: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) + %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) + %res2 = add i8 %res1, %res + ret i8 %res2 +} +declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) { +; CHECK-LABEL: test_vptestmd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) + %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) + %res2 = add i16 %res1, %res + ret i16 %res2 +} +declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) + +define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { +; CHECK-LABEL: test_mask_store_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovss %xmm0, (%rdi) {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 ) + +declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) + +define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vsubps_rn: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vsubps_rd: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vsubps_ru: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, 
i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vsubps_rz: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vmulps_rn: +; CHECK: # BB#0: +; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vmulps_rd: +; CHECK: # BB#0: +; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vmulps_ru: +; CHECK: # BB#0: +; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_vmulps_rz: +; CHECK: # BB#0: +; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +;; mask float +define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 0) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_rd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 1) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_ru: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_rz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rz-sae}, 
%zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 3) + ret <16 x float> %res +} + +;; With Passthru value +define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_passthru_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 0) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_passthru_rd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 1) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_passthru_ru: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_vmulps_mask_passthru_rz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 3) + ret <16 x float> %res +} + +;; mask double +define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_vmulpd_mask_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 0) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_vmulpd_mask_rd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 1) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_vmulpd_mask_ru: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call 
<8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 2) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_vmulpd_mask_rz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 3) + ret <8 x double> %res +} + +define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_mul_epi32_rr: +; CHECK: # BB#0: +; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rrk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rrkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { +; CHECK-LABEL: test_mul_epi32_rm: +; CHECK: # BB#0: +; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rmk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rmkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) { +; CHECK-LABEL: test_mul_epi32_rmb: +; CHECK: 
# BB#0: +; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rmbk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mul_epi32_rmbkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) + +define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_mul_epu32_rr: +; CHECK: # BB#0: +; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rrk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rrkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { +; CHECK-LABEL: test_mul_epu32_rm: +; CHECK: # BB#0: +; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = 
load <16 x i32>, <16 x i32>* %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rmk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rmkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) { +; CHECK-LABEL: test_mul_epu32_rmb: +; CHECK: # BB#0: +; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rmbk: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mul_epu32_rmbkz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i64, i64* %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) + +define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae: +; 
CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_add_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, 
%zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_add_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_add_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_add_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_add_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_add_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_add_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> 
@test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_sub_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: 
test_mm512_sub_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_sub_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_div_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res 
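+; NOTE: the final i32 operand of these intrinsics selects the embedded rounding mode; as the CHECK lines show, 0 yields {rn-sae}, 1 {rd-sae}, 2 {ru-sae}, 3 {rz-sae}, 4 the current (default) mode with no annotation, and 8 {sae} (suppress-all-exceptions without a rounding override). 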
+} + +define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_div_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_rn_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_rd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 
%mask) { +; CHECK-LABEL: test_mm512_div_round_ps_ru_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_rz_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_min_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_min_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_min_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_min_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> 
@llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_min_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_maskz_max_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_max_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; CHECK-LABEL: test_mm512_mask_max_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_max_round_ps_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_max_round_ps_current: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x 
float>, <16 x float>, i16, i32) + +declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_rd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_ru: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_rz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_add_ss_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0) + ret <4 x float> %res +} + +define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_add_ss_rn: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> 
%a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_ss_current_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load float, float* %a1 + %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0 + %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1 + %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2 + %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3 + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_add_ss_current_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load float, float* %a1 + %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0 + %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1 + %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2 + %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3 + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4) + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0) + ret <2 x double> %res +} + +define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_rd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) + ret <2 x double> %res +} + +define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_ru: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2) + ret <2 x double> %res +} + +define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_rz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x 
double> %a1, <2 x double> %a2, i8 %mask, i32 3) + ret <2 x double> %res +} + +define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_current: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_add_sd_rn: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0) + ret <2 x double> %res +} + +define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_add_sd_rn: +; CHECK: # BB#0: +; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0) + ret <2 x double> %res +} + +define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_add_sd_current_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load double, double* %a1 + %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 + %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_add_sd_current_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load double, double* %a1 + %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 + %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 + %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4) + ret <2 x double> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_ss_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_ss_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_max_ss_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_max_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_ss_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load float, float* %a1 + %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0 + %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1 + %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2 + %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3 + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_ss_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load float, float* %a1 + %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0 + %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1 + %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2 + %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3 + %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4) + ret <4 x float> %res +} +declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x 
double>, i8, i32) nounwind readnone + +define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_sd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) + ret <2 x double> %res +} + +define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_sd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8) + ret <2 x double> %res +} + +define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_max_sd_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8) + ret <2 x double> %res +} + +define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_max_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_mask_max_sd_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a1.val = load double, double* %a1 + %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 + %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 + %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) { +; CHECK-LABEL: test_maskz_max_sd_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; 
CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %a1.val = load double, double* %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
+; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
+; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
+; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
+; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
+; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl (%rdi), %eax # sched: [1:0.50]
+; CHECK-NEXT: vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %b = load i32, i32* %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
+; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
+; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %b = load i32, i32* %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
+
+define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
+; CHECK-LABEL: _mm_cvt_roundu64_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 1) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
+; CHECK-LABEL: _mm_cvtu64_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 4) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind readnone
+
+define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
+; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone
+
+define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
+; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 1) ; <<<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
+; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+{
+ %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 4) ; <<<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %zmm1, %zmm3
+; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddpd %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2
+; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd %zmm1, %zmm2
+; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %x2s = load double, double* %x2ptr
+ %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
+ %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
+ %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovqb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovsqb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovusqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovusqb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovqw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovsqw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovusqw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovqd %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i32> %res0, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovqd %zmm0, (%rdi)
+; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsqd %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i32> %res0, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovsqd %zmm0, (%rdi)
+; CHECK-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovusqd %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i32> %res0, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovusqd %zmm0, (%rdi)
+; CHECK-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovdb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovsdb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovusdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovusdb %zmm0, (%rdi)
+; CHECK-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovdw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsdw %zmm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovsdw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovusdw %zmm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovusdw %zmm0, (%rdi)
+; CHECK-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
+
+define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 2)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_getexp_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; 
CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm4 # sched: [4:0.50] +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8) + %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8) + + %res.1 = fadd <4 x float> %res0, %res1 + %res.2 = fadd <4 x float> %res2, %res3 + %res = fadd <4 x float> %res.1, %res.2 + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_getexp_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8) + %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4) + + %res.1 = fadd <2 x double> %res0, %res1 + %res.2 = fadd <2 x double> %res2, %res3 + %res = fadd <2 x double> %res.1, %res.2 + ret <2 x double> %res +} + +declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + ret i8 %res4 +} + +define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all: +; CHECK: # BB#0: +; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %edx +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, 
%k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: orb %cl, %dl # sched: [1:0.25] +; CHECK-NEXT: orb %sil, %al # sched: [1:0.25] +; CHECK-NEXT: orb %dl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + + %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + + %res11 = or i8 %res1, %res2 + %res12 = or i8 %res3, %res4 + %res13 = or i8 %res11, %res12 + ret i8 %res13 +} + +declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + + %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) + ret i8 %res2 +} + + +define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %edx +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: andb %cl, %dl # sched: [1:0.25] +; CHECK-NEXT: andb %sil, %al # sched: [1:0.25] +; CHECK-NEXT: andb %dl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8) + + %res11 = and i8 %res1, %res2 + %res12 = and i8 %res3, %res4 + %res13 = and i8 %res11, %res12 + ret i8 %res13 +} + +declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare 
<8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4) + %res1 = call <8 x 
double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm4 # sched: [4:0.50] +; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8) + %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4) + %res11 = fadd <2 x double> %res, %res1 + %res12 = fadd <2 x double> %res2, %res3 + %res13 = fadd <2 x double> %res11, %res12 + ret <2 x double> %res13 +} + +declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm2 # sched: [4:0.50] +; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 
11, <4 x float> zeroinitializer, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8) + %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4) + %res11 = fadd <4 x float> %res, %res1 + %res12 = fadd <4 x float> %res2, %res3 + %res13 = fadd <4 x float> %res11, %res12 + ret <4 x float> %res13 +} + +declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) + +define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2 + ret <8 x double> %res2 +} + +define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 + ret <16 x float> %res2 +} + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x 
float> %x0, <16 x i32> %x1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer + ret <16 x float> %res2 +} + +; Test case to make sure we can print shuffle decode comments for constant pool loads. +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 13, i32 12>) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 13, i32 12>) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 + ret <16 x float> %res2 +} + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 13, i32 12>) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer + ret <16 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1} # sched: [3:1.00] +; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} # sched: [8:1.00] +; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0 # sched: [8:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float>
@llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + +define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + +define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, 
<8 x i64> %x2, i32 33, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_comi_sd_eq: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_comi_sd_lt: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) + +define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = 
call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) + +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vprolq $3, %zmm0, 
%zmm1 {%k1} +; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm3 +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm3 +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm3 +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> 
@llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm3 +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 +; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 # sched: [1:0.50] +; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3 +; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 +; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z} +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 # sched: [1:0.50] +; CHECK-NEXT: vmovapd %zmm0, %zmm5 +; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3 +; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x 
double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 # sched: [1:0.50] +; CHECK-NEXT: vmovaps %xmm0, %xmm5 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z} +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: 
vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 # sched: [1:0.50] +; CHECK-NEXT: vmovaps %zmm0, %zmm5 +; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} +; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3 +; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vmovaps %zmm0, %zmm4 +; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8) + %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm0, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1} +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd 
<2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 # sched: [1:0.50] +; CHECK-NEXT: vmovapd %xmm0, %xmm5 # sched: [1:1.00] +; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastd %edi, %zmm2 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x 
i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res2, %res3 + ret <16 x i32> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} +declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) + +declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm0, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovapd %xmm0, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm0, 
%xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovaps %xmm0, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm0, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res +} +declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: 
vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) { +; CHECK-LABEL: fmadd_ss_mask_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50] +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a.val = load float, float* %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, float* %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + + %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 
%c, i32 4) + + %sr = extractelement <4 x float> %vr, i32 0 + store float %sr, float* %a + ret void +} + +define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) { +; CHECK-LABEL: fmadd_ss_maskz_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50] +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a.val = load float, float* %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, float* %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + + %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4) + + %sr = extractelement <4 x float> %vr, i32 0 + store float %sr, float* %a + ret void +} + +define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) { +; CHECK-LABEL: fmadd_sd_mask_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50] +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a.val = load double, double* %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, double* %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, double* %a + ret void +} + +define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) { +; CHECK-LABEL: fmadd_sd_maskz_memfold: +; CHECK: # BB#0: +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50] +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %a.val = load double, double* %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, double* %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, double* %a + ret void +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231sd 
%xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovapd %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231sd 
{rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm2, %xmm3 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 # sched: [4:0.50] +; CHECK-NEXT: vmovaps %xmm2, %xmm4 # sched: [1:1.00] +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load float, float* %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load float, float* %ptr_b + %vecinit.i = insertelement <4 
x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax # sched: [1:0.25] +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load float, float* %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4) + ret < 4 x float> %res +} + +define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psll_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psll_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psll_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) { +; CHECK-LABEL: test_x86_avx512_psll_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psll_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psll_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) { +; CHECK-LABEL: test_x86_avx512_pslli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_pslli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_pslli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) { +; CHECK-LABEL: test_x86_avx512_pslli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_pslli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_pslli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) { +; 
CHECK-LABEL: test_x86_avx512_psra_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psra_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psra_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psra_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psra_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psra_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + + +define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) { +; CHECK-LABEL: test_x86_avx512_psrai_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: 
test_x86_avx512_mask_psrai_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrai_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) { +; CHECK-LABEL: test_x86_avx512_psrai_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrai_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrai_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone + + + +define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psrl_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrl_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> 
@test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrl_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) { +; CHECK-LABEL: test_x86_avx512_psrl_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrl_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrl_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) { +; CHECK-LABEL: test_x86_avx512_psrli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrli_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, 
<16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) { +; CHECK-LABEL: test_x86_avx512_psrli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrli_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone + +define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psllv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psllv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psllv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: test_x86_avx512_psllv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> 
@test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psllv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psllv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psrav_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrav_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrav_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: test_x86_avx512_psrav_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrav_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x 
i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrav_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: test_x86_avx512_psrlv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrlv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: test_x86_avx512_psrlv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrlv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x 
i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + Index: test/CodeGen/X86/avx512bw-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512bw-schedule-intrinsics.ll @@ -0,0 +1,1414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_mask_packs_epi32_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rmb_512: +; 
CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) + +define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_packs_epi16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + ret <64 x i8> %1 +} + +define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru + ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi16_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x 
i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + ret <64 x i8> %1 +} + +define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru + ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) + + +define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_mask_packus_epi32_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 
= bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i32>, <16 x i32>* %ptr_b + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rmb_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 +} + +declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) + +define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_packus_epi16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + ret <64 x i8> %1 +} + +define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru 
+ ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi16_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + ret <64 x i8> %1 +} + +define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru + ret <64 x i8> %3 +} + +define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) + +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_512: +; CHECK: # 
BB#0: +; CHECK-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> 
@llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + 
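+; Unlike the pack tests above, which model masking in plain IR by bitcasting
+; the mask to <32 x i1> (or <64 x i1>) and selecting against the passthru or
+; zero vector, the saturating add/sub tests in this file use the legacy masked
+; intrinsics (llvm.x86.avx512.mask.padds/psubs/paddus/psubus.w.512), which
+; take the passthru vector and mask as explicit operands.
+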
+define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} +; CHECK-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> 
@llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1} +; CHECK-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) + +define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpshufb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %mask) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1) + %mask.cast = bitcast i64 %mask to <64 x i1> + %res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> %x2 + ret <64 x i8> %res2 +} + +define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> %x1, i64 %mask) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512_maskz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1) + %mask.cast = bitcast i64 %mask to <64 x i1> + %res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> zeroinitializer + ret <64 x i8> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> 
%x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovwb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmovwb %zmm0, (%rdi) +; CHECK-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; 
CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmovswb %zmm0, (%rdi) +; CHECK-NEXT: vpmovswb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmovuswb %zmm0, (%rdi) +; CHECK-NEXT: vpmovuswb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i16> 
@llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res3, %res2 + ret <32 x i16> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) + +define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ +; CHECK-LABEL: test_int_x86_avx512_mask_psadb_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 +; CHECK-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) + %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; CHECK-LABEL: test_int_x86_avx512_kunpck_wd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: 
kunpckwd %k1, %k0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; CHECK-LABEL: test_int_x86_avx512_kunpck_qd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k0 +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: kunpckdq %k1, %k0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} + +declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) + +define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovb2m %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0) + ret i64 %res +} + +declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>) + +define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovw2m %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16> %x0) + ret i32 %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv32hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res3, %res2 + ret <32 x i16> %res4 +} + +declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav32_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravw %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res3, %res2 + ret <32 x i16> %res4 +} + +define <32 x 
i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] sched: [5:0.50] +; CHECK-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>, + <32 x i16> <i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49>, + <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psllv32hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvw %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddw %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res3, %res2 + ret <32 x i16> %res4 +} + +declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res3, %res2 + ret <32 x i16> %res4 +} + +declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64) + +define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_b_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rcx +; CHECK-NEXT: vptestmb %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) + %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) + %res2 = add i64 %res, %res1 + ret i64 %res2 +} + +declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32) + +define
i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64) + +define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rcx +; CHECK-NEXT: vptestnmb %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: addq %rcx, %rax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) + %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) + %res2 = add i64 %res, %res1 + ret i64 %res2 +} + +declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32) + +define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastb %edi, %zmm2 +; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) + %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) + %res3 = add <64 x i8> %res, %res1 + %res4 = add <64 x i8> %res2, %res3 + ret <64 x i8> %res4 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastw %edi,
%zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastw %edi, %zmm2 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res2, %res3 + ret <32 x i16> %res4 +} + + +define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_x86_avx512_psll_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psll_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psll_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone + + +define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) { +; CHECK-LABEL: test_x86_avx512_pslli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_pslli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllw $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_pslli_w_512(<32 x i16> %a0, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_pslli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast 
i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone + + +define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_x86_avx512_psra_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psra_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psra_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone + + +define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) { +; CHECK-LABEL: test_x86_avx512_psrai_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsraw $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrai_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraw $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrai_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsraw $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone + + +define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_x86_avx512_psrl_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; 
CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrl_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrl_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone + + +define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) { +; CHECK-LABEL: test_x86_avx512_psrli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlw $7, %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_mask_psrli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlw $7, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru + ret <32 x i16> %res2 +} +define <32 x i16> @test_x86_avx512_maskz_psrli_w_512(<32 x i16> %a0, i32 %mask) { +; CHECK-LABEL: test_x86_avx512_maskz_psrli_w_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlw $7, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1] + %mask.cast = bitcast i32 %mask to <32 x i1> + %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer + ret <32 x i16> %res2 +} +declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone Index: test/CodeGen/X86/avx512bwvl-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512bwvl-schedule-intrinsics.ll @@ -0,0 +1,2758 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_mask_packs_epi32_rr_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: 
retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rm_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rmb_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %q = 
load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) + +define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_mask_packs_epi32_rr_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rrkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rm_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 
x i32>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi32_rmb_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) + +define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_packs_epi16_rr_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi16_rm_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) + +define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_packs_epi16_rr_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rrkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packs_epi16_rm_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x 
i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packs_epi16_rmkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) + + +define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_mask_packus_epi32_rr_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rm_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x 
i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rmb_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) + +define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_mask_packus_epi32_rr_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + 
+define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rrkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rm_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi32_rmb_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # 
sched: [2:1.00] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) + +define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_packus_epi16_rr_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi16_rm_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmkz_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) + +define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; 
CHECK-LABEL: test_mask_packus_epi16_rr_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rrkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_packus_epi16_rm_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmk_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_packus_epi16_rmkz_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) + +define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} 
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epi16_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi16_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epi16_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi16_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm3
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm3 # sched: [1:0.25]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovwb %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovwb %xmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi)
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovswb %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovswb %xmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi)
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi)
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovwb %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi)
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+  call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovswb %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovswb %ymm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi)
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+  call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi)
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper # sched: [4:1.00]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+  call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
+  %res2 = add <8 x i32> %res, %res1
+  ret <8 x i32> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT: retq # sched: [2:1.00]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1,
<16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res2, %res3 + ret <8 x i16> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 +; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>) + +define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovb2m %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0) + ret i16 %res +} + +declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>) + +define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovb2m %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: 
vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0) + ret i32 %res +} + +declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>) + +define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0) + ret i8 %res +} + +declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>) + +define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0) + ret i16 %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1} +; 
CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> 
@llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm3 +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm3 +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16) + +define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32) + +define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +} + +declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8) + +define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16) + +define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16) + +define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32) + +define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) + %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1) + %res2 = add i32 %res, %res1 + ret i32 %res2 +}
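+
+; Each ptestm/ptestnm test calls the intrinsic twice, once with the %x2 mask
+; and once with an all-ones mask, and sums the results so that both the
+; {%k1}-masked and the unmasked encodings are checked. The trailing
+; "sched: [N:M]" comments are emitted by -print-schedule and give the SKX
+; latency (N) and reciprocal throughput (M) of each instruction.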
+ +declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastb %edi, %ymm2 +; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) + %res3 = add <32 x i8> %res, %res1 + %res4 = add <32 x i8> %res2, %res3 + ret <32 x i8> %res4 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastb %edi, %xmm2 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) + %res3 = add <16 x i8> %res, %res1 + %res4 = add <16 x i8> %res2, %res3 + ret <16 x 
i8> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastw %edi, %ymm2 +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res2, %res3 + ret <16 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastw %edi, %xmm2 +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res2, %res3 + ret <8 x i16> %res4 +} Index: test/CodeGen/X86/avx512cd-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512cd-schedule-intrinsics.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) + ret <16 x i32> %res +} +declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) + +define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) + ret <8 x i64> %res +} +declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) + +declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly + +define <8 x i64> @test_conflict_q(<8 x i64> %a) { +; CHECK-LABEL: test_conflict_q: +; CHECK: # BB#0: +; CHECK-NEXT: vpconflictq %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, 
<8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly + +define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) { +; CHECK-LABEL: test_maskz_conflict_d: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictd %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_conflict_q: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret <8 x i64> %res +} + +define <16 x i32> @test_lzcnt_d(<16 x i32> %a) { +; CHECK-LABEL: test_lzcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntd %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false) + ret <16 x i32> %1 +} +declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) #0 + +define <8 x i64> @test_lzcnt_q(<8 x i64> %a) { +; CHECK-LABEL: test_lzcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntq %zmm0, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false) + ret <8 x i64> %1 +} +declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) #0 + +define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_lzcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %b + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_lzcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} Index: test/CodeGen/X86/avx512cdvl-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512cdvl-schedule-intrinsics.ll @@ -0,0 +1,192 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntd %xmm0, %xmm2 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # sched: [1:0.50] +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, 
<8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1 + %4 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %5 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %6 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer + %res2 = add <4 x i32> %3, %4 + %res4 = add <4 x i32> %res2, %7 + ret <4 x i32> %res4 +} +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #0 + +define <8 x i32> @test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntd %ymm0, %ymm2 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + %4 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false) + %res2 = add <8 x i32> %3, %4 + ret <8 x i32> %res2 +} +declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) #0 + +define <2 x i64> @test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntq %xmm0, %xmm2 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + %4 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false) + %res2 = add <2 x i64> %3, %4 + ret <2 x i64> %res2 +} +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0 + +define <4 x i64> @test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256: +; CHECK: # BB#0: +; CHECK-NEXT: vplzcntq %ymm0, %ymm2 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + %4 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false) + %res2 = add <4 x i64> %3, %4 + ret <4 x i64> %res2 +} +declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) #0 + +declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpconflictd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpconflictd %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + %res3 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res2 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res2, %res3 + ret <4 x i32> %res4 +}
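+
+; The vplzcnt tests above apply the i8 mask in plain IR rather than through a
+; mask intrinsic: the mask is bitcast to <8 x i1>, narrowed with a
+; shufflevector when fewer than eight lanes are used, and fed to a select
+; between the ctlz result and the passthrough value, e.g.:
+;   %m = bitcast i8 %mask to <8 x i1>
+;   %m4 = shufflevector <8 x i1> %m, <8 x i1> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;   %r = select <4 x i1> %m4, <4 x i32> %lz, <4 x i32> %passthru
+; (%m, %m4, %lz and %passthru are illustrative names, not values taken from
+; the tests above.)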
+ +declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictd %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpconflictd %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpconflictq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpconflictq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpconflictq %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16) + +define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) + +define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
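+
+; vpbroadcastmw2d and vpbroadcastmb2q broadcast the contents of a k mask
+; register into every element of the destination vector, which is why each
+; broadcastm test first moves the GPR argument into %k0 with kmovd.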
+ +define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8) Index: test/CodeGen/X86/avx512dq-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512dq-schedule-intrinsics.ll @@ -0,0 +1,467 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2qq {ru-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtpd2qq {rn-sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 0) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2uqq {ru-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtpd2uqq {rn-sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 0) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2qq {ru-sae}, %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtps2qq {rn-sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 0) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2uqq {ru-sae}, %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtps2uqq {rn-sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: 
[2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 0) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64>, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_cvt_qq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtqq2pd {rn-sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 %x2, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 -1, i32 0) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64>, <8 x float>, i8, i32) + +define <8 x float>@test_int_x86_avx512_mask_cvt_qq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ps %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtqq2ps {rn-sae}, %zmm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 %x2, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 -1, i32 0) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2qq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttpd2qq {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2uqq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttpd2uqq {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_cvtt_ps2qq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2qq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttps2qq {sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float>, <8 x i64>, i8, i32) + +define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2uqq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttps2uqq {sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64>, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2pd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtuqq2pd {rn-sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 %x2, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 -1, i32 0) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64>, <8 x float>, i8, i32) + +define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ps %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtuqq2ps {rn-sae}, %zmm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 %x2, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 -1, i32 0) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducepd $8, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vreducepd $4, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 8, <8 x double> %x2, i8 %x3, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 4, <8 x double> %x2, 
i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceps $44, {sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vreduceps $11, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 44, <16 x float> %x2, i16 %x3, i32 8) + %res1 = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 4) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangepd $8, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vrangepd $4, {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 8, <8 x double> %x3, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 4, <8 x double> %x3, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangeps $88, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vrangeps $4, {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 88, <16 x float> %x3, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 4, <16 x float> %x3, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + +define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> 
@llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + +define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 8) + %res1 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) + +define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) + +define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclasspd $2, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} +declare i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float>, i32, i16) + +define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addl %ecx, %eax # sched: [1:0.25] +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1) + %res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, <16 x float> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %x3) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512: +; CHECK: # 
BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>) + +define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0) + ret i16 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovq2m %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0) + ret i8 %res +} Index: test/CodeGen/X86/avx512dqvl-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512dqvl-schedule-intrinsics.ll @@ -0,0 +1,745 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x 
double> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> 
@llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) + %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) + %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero sched: [1:0.50] +; 
CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %res4 = fadd <4 x float> %res1, %res3 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + 
ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) + %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x 
i64> %x0, <4 x double> %x1, i8 %x2) + %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero sched: [1:0.50] +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %res4 = fadd <4 x float> %res1, %res3 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> 
%x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducepd $4, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vreducepd $8, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 8, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducepd $4, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vreducepd $0, %ymm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 0, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceps $4, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vreduceps $88, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceps $11, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vreduceps $11, %ymm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 
%x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vrangepd $8, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 4, <2 x double> %x3, i8 %x4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 8, <2 x double> %x3, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vrangepd $88, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 4, <4 x double> %x3, i8 %x4) + %res1 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 88, <4 x double> %x3, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vrangeps $88, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 4, <4 x float> %x3, i8 %x4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 88, <4 x float> %x3, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vrangeps $88, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 4, <8 x float> %x3, i8 %x4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 88, <8 x float> 
%x3, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclassps $2, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclasspd $4, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfpclasspd $2, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: addb %cl, %al # sched: [1:0.25] +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1) + %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = 
xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [4:0.50] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3) + %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res3, %res2 + ret <8 x float> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = mem[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %y_64 = load i64, i64 * %y_ptr + %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0 + %y = bitcast <2 x i64> %y_v2i64 to <4 x i32> + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128: +; CHECK: # BB#0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # sched: [1:0.50] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>) + +define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>) + +define 
i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovd2m %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovq2m %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX +; CHECK-NEXT: vzeroupper # sched: [4:1.00] +; CHECK-NEXT: retq # sched: [2:1.00] + %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0) + ret i8 %res +} Index: test/CodeGen/X86/avx512vpopcntdq-schedule-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512vpopcntdq-schedule-intrinsics.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK + +define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) { +; CHECK-LABEL: test_mask_vpopcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] sched: [5:0.50] +; CHECK-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] sched: [5:0.50] +; CHECK-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; CHECK-NEXT: vpsrlw $4, %zmm1, %zmm1 +; CHECK-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufb %zmm1, %zmm4, %zmm1 +; CHECK-NEXT: vpaddb %zmm3, %zmm1, %zmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] +; CHECK-NEXT: vpsadbw %zmm2, %zmm3, %zmm3 +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] +; CHECK-NEXT: vpsadbw %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpackuswb %zmm3, %zmm1, %zmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a + ret <16 x i32> %3 +} + +define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) { +; CHECK-LABEL: test_maskz_vpopcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] sched: [5:0.50] +; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] sched: [5:0.50] +; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; CHECK-NEXT: vpsrlw $4, %zmm0, %zmm0 +; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_vpopcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] sched: [5:0.50] +; CHECK-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] sched: [5:0.50] +; CHECK-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; CHECK-NEXT: vpsrlw $4, %zmm0, %zmm0 +; CHECK-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpshufb %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50] +; CHECK-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} + +define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vpopcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] sched: [5:0.50] +; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] sched: [5:0.50] +; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; CHECK-NEXT: vpsrlw $4, %zmm0, %zmm0 +; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 # sched: [1:0.50] +; CHECK-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq # sched: [2:1.00] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)