Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -192,6 +192,7 @@
   ValueType KVT = _vt;
 }
 
+def v1i1_info : X86KVectorVTInfo<VK1, VK1WM, v1i1>;
 def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
 def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
 def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
@@ -2935,6 +2936,44 @@
 defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
 
+// Extract a narrower mask from a wider one: shift the source mask right by
+// the start index, then copy the result to the narrower mask register class.
+multiclass vextract_for_mask_to_mask<string InstrStr, X86KVectorVTInfo From,
+                                     X86KVectorVTInfo To> {
+  def : Pat<(To.KVT (extract_subvector (From.KVT From.KRC:$src),
+                                       (iPTR imm:$imm8))),
+            (To.KVT (COPY_TO_REGCLASS
+                     (!cast<Instruction>(InstrStr) From.KRC:$src,
+                      (i8 imm:$imm8)), To.KRC))>;
+}
+
+// For source mask types narrower than v8i1, widen the source to VK8 first so
+// that KSHIFTRB can be used.
+multiclass vextract_for_mask_to_mask_legal_b<X86KVectorVTInfo From,
+                                             X86KVectorVTInfo To> {
+  def : Pat<(To.KVT (extract_subvector (From.KVT From.KRC:$src),
+                                       (iPTR imm:$imm8))),
+            (To.KVT (COPY_TO_REGCLASS
+                     (KSHIFTRBri (COPY_TO_REGCLASS From.KRC:$src, VK8),
+                      (i8 imm:$imm8)), To.KRC))>;
+}
+
+defm : vextract_for_mask_to_mask_legal_b<v2i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask_legal_b<v4i1_info, v1i1_info>;
+
+defm : vextract_for_mask_to_mask<"KSHIFTRBri", v8i1_info,  v1i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRWri", v16i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRDri", v32i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRBri", v8i1_info,  v2i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRWri", v16i1_info, v2i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRDri", v32i1_info, v2i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v2i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRBri", v8i1_info,  v4i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRWri", v16i1_info, v4i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRDri", v32i1_info, v4i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v4i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRWri", v16i1_info, v8i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRDri", v32i1_info, v8i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v8i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRDri", v32i1_info, v16i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v16i1_info>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQri", v64i1_info, v32i1_info>;
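+// As an illustration, the <"KSHIFTRWri", v16i1_info, v8i1_info> instantiation
+// above expands to roughly:
+//   def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR imm:$imm8))),
+//             (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 imm:$imm8)),
+//                    VK8))>;
+// which generalizes the fixed-index v16i1 -> v8i1 pattern removed below to an
+// arbitrary immediate index.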
+
 def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
           (v2i1 (COPY_TO_REGCLASS
                  (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
@@ -2943,13 +2982,6 @@
           (v4i1 (COPY_TO_REGCLASS
                  (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
                  VK4))>;
-def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
-          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
-def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
-          (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
-def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
-          (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
-
 // Patterns for kmask shift
 multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
Index: test/CodeGen/X86/avx512-extract-subvector-load-store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -0,0 +1,819 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
+
+
+define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 4, i32 4>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 7, i32 7>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 8, i32 8>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 8, i32 8, i32 8, i32 8>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 15, i32 15>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 15, i32 15, i32 15, i32 15>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 16, i32 16>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 16, i32 16, i32 16, i32 16>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT:    vpmovq2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+    store <8 x float> %d2, <8 x float>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $30, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 31, i32 31>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $28, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 31, i32 31, i32 31, i32 31>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpmovq2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+    store <8 x float> %d2, <8 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 32, i32 32>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 32, i32 32, i32 32, i32 32>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT:    vpmovq2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+    store <8 x float> %d2, <8 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
+; AVX512-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
+    store <16 x float> %d2, <16 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $62, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 63, i32 63>
+    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+    store <2 x double> %d2, <2 x double>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 63, i32 63, i32 63, i32 63>
+    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+    store <4 x float> %d2, <4 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $56, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpmovq2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+    store <8 x float> %d2, <8 x float>* %a3
+    ret void
+}
+define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $48, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpermd %zmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
+    store <16 x float> %d2, <16 x float>* %a3
+    ret void
+}
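+; The *_store tests below store the extracted mask directly. The v1i1 cases
+; need no vector round trip at all: the mask is shifted with kshift and stored
+; with kmovb/kmovw, while the wider cases still go through vpmovm2* / vpmov*2m
+; to broadcast the extracted bit.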
+define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <2 x i1>, <2 x i1>* %a0
+    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32> <i32 1>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <3 x i1>, <3 x i1>* %a0
+    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32> <i32 1>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <3 x i1>, <3 x i1>* %a0
+    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32> <i32 2>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <4 x i1>, <4 x i1>* %a0
+    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32> <i32 2>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <4 x i1>, <4 x i1>* %a0
+    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32> <i32 3>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32> <i32 4>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 4, i32 4>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32> <i32 7>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <8 x i1>, <8 x i1>* %a0
+    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 7, i32 7>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32> <i32 8>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 8, i32 8>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 8, i32 8, i32 8, i32 8>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32> <i32 15>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 15, i32 15>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovw (%rdi), %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <16 x i1>, <16 x i1>* %a0
+    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 15, i32 15, i32 15, i32 15>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32> <i32 16>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 16, i32 16>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 16, i32 16, i32 16, i32 16>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512-NEXT:    vpmovq2m %zmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+    store <8 x i1> %d1, <8 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $31, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32> <i32 31>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $30, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 31, i32 31>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $28, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 31, i32 31, i32 31, i32 31>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovd (%rdi), %k0
+; AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpmovq2m %zmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <32 x i1>, <32 x i1>* %a0
+    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+    store <8 x i1> %d1, <8 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32> <i32 32>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 32, i32 32>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 32, i32 32, i32 32, i32 32>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512-NEXT:    vpmovq2m %zmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+    store <8 x i1> %d1, <8 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
+; AVX512-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512-NEXT:    kmovw %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+    store <16 x i1> %d1, <16 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $63, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32> <i32 63>
+    store <1 x i1> %d1, <1 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $62, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT:    vpmovq2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 63, i32 63>
+    store <2 x i1> %d1, <2 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 63, i32 63, i32 63, i32 63>
+    store <4 x i1> %d1, <4 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $56, %k0, %k0
+; AVX512-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpmovq2m %zmm0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+    store <8 x i1> %d1, <8 x i1>* %a1
+    ret void
+}
+define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    kmovq (%rdi), %k0
+; AVX512-NEXT:    kshiftrq $48, %k0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512-NEXT:    kmovw %k0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    ret{{[l|q]}}
+    %d0 = load <64 x i1>, <64 x i1>* %a0
+    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+    store <16 x i1> %d1, <16 x i1>* %a1
+    ret void
+}