Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3249,6 +3249,7 @@ multiclass avx512_load opc, string OpcodeStr, X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, + bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX, EVEX_KZ; - let canFoldAsLoad = 1, isReMaterializable = 1, + let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in def rm : AVX512PI, EVEX; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { @@ -3327,16 +3330,20 @@ multiclass avx512_load_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, + bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in defm Z : avx512_load, EVEX_V512; + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load, EVEX_V256; + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V256; defm Z128 : avx512_load, EVEX_V128; + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V128; } } @@ -3416,13 +3423,13 @@ PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, "VMOVUPS">, PS, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, "VMOVUPD">, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3439,24 +3446,24 @@ HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; -defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, "VMOVDQU16">, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -710,7 +710,7 @@ ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1099,7 +1099,7 @@ ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1732,7 +1732,7 @@ ; ; AVX512BW-LABEL: avg_v64i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: 
vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -2122,7 +2122,7 @@ ; ; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -2647,7 +2647,7 @@ ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -2955,7 +2955,7 @@ ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -763,7 +763,7 @@ ; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovdqu (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x08] +; AVX512VL-NEXT: vmovdqa (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x08] ; AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %a0 = load <32 x i8>, <32 x i8>* %ptr Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -1295,7 +1295,7 @@ ; SKX-NEXT: vpmovm2w %k0, %zmm0 ; SKX-NEXT: kmovd %eax, %k0 ; SKX-NEXT: vpmovm2w %k0, %zmm1 -; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovw2m %zmm2, %k0 ; SKX-NEXT: kmovd %k0, %eax Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -53,7 +53,7 @@ define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) { ; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512BW-NEXT: kmovd %edx, %k1 ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z} @@ -64,7 +64,7 @@ ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0 +; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} ; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z} @@ -82,7 +82,7 @@ define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) 
{ ; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512BW-NEXT: kmovq %rdx, %k1 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z} @@ -93,7 +93,7 @@ ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0 +; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1} ; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z} Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1710,13 +1710,13 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] ; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> , Index: test/CodeGen/X86/avx512bw-mov.ll =================================================================== --- test/CodeGen/X86/avx512bw-mov.ll +++ test/CodeGen/X86/avx512bw-mov.ll @@ -4,7 +4,7 @@ define <64 x i8> @test1(i8 * %addr) { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <64 x i8>* %res = load <64 x i8>, <64 x i8>* %vaddr, align 1 @@ -52,7 +52,7 @@ define <32 x i16> @test5(i8 * %addr) { ; CHECK-LABEL: test5: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <32 x i16>* %res = load <32 x i16>, <32 x i16>* %vaddr, align 1 Index: test/CodeGen/X86/avx512bwvl-mov.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-mov.ll +++ test/CodeGen/X86/avx512bwvl-mov.ll @@ -4,7 +4,7 @@ define <32 x i8> @test_256_1(i8 * %addr) { ; CHECK-LABEL: test_256_1: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <32 x i8>* %res = load <32 x i8>, <32 x i8>* %vaddr, align 1 @@ -52,7 +52,7 @@ define <16 x i16> @test_256_5(i8 * %addr) { ; CHECK-LABEL: test_256_5: ; CHECK: ## BB#0: -; 
CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i16>* %res = load <16 x i16>, <16 x i16>* %vaddr, align 1 @@ -100,7 +100,7 @@ define <16 x i8> @test_128_1(i8 * %addr) { ; CHECK-LABEL: test_128_1: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i8>* %res = load <16 x i8>, <16 x i8>* %vaddr, align 1 @@ -148,7 +148,7 @@ define <8 x i16> @test_128_5(i8 * %addr) { ; CHECK-LABEL: test_128_5: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i16>* %res = load <8 x i16>, <8 x i16>* %vaddr, align 1 Index: test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- test/CodeGen/X86/nontemporal-loads.ll +++ test/CodeGen/X86/nontemporal-loads.ll @@ -1750,7 +1750,7 @@ ; ; AVX512BW-LABEL: test_unaligned_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovups (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_unaligned_v32i16: @@ -1785,7 +1785,7 @@ ; ; AVX512BW-LABEL: test_unaligned_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovups (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_unaligned_v64i8: Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -921,7 +921,7 @@ ; AVX512BW-LABEL: mul_v64i8c: ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -814,7 +814,7 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB2_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2 +; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $4, %rax Index: test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -58,7 +58,7 @@ ; ; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) ; 
AVX512BWVL-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L @@ -113,7 +113,7 @@ ; ; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -61,7 +61,7 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -122,7 +122,7 @@ ; ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -33,14 +33,14 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -75,14 +75,14 @@ ; ; AVX512BW-LABEL: trunc_v32i16_to_v32i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq Index: test/CodeGen/X86/sse42-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse42-intrinsics-x86.ll +++ test/CodeGen/X86/sse42-intrinsics-x86.ll @@ -52,7 +52,7 @@ ; SKX: ## BB#0: ; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SKX-NEXT: vmovdqu (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00] +; SKX-NEXT: vmovdqa (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] ; SKX-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00] ; SKX-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00] ; SKX-NEXT: vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07] @@ -298,7 +298,7 @@ ; SKX: ## BB#0: ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] -; SKX-NEXT: vmovdqu (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01] +; SKX-NEXT: vmovdqa (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x01] ; SKX-NEXT: vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07] ; SKX-NEXT: 
movl %ecx, %eax ## encoding: [0x89,0xc8] ; SKX-NEXT: retl ## encoding: [0xc3] Index: test/CodeGen/X86/ssse3-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/ssse3-intrinsics-x86.ll +++ test/CodeGen/X86/ssse3-intrinsics-x86.ll @@ -203,7 +203,7 @@ ; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: ; SKX: ## BB#0: ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SKX-NEXT: vmovdqu (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x08] +; SKX-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] ; SKX-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0] ; SKX-NEXT: retl ## encoding: [0xc3] %a0 = load <16 x i8>, <16 x i8>* %ptr Index: test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -996,7 +996,7 @@ ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl @@ -1026,7 +1026,7 @@ ; ; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq @@ -1066,7 +1066,7 @@ ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl @@ -1096,7 +1096,7 @@ ; ; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq Index: test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- test/CodeGen/X86/vector-bitreverse.ll +++ test/CodeGen/X86/vector-bitreverse.ll @@ -1435,11 +1435,11 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -1749,11 +1749,11 @@ ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2069,11 +2069,11 @@ ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = 
[0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2409,11 +2409,11 @@ ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-128.ll +++ test/CodeGen/X86/vector-lzcnt-128.ll @@ -200,7 +200,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -465,7 +465,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -717,7 +717,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu 
{{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -958,7 +958,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -1170,7 +1170,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -1374,7 +1374,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 @@ -1555,7 +1555,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1717,7 +1717,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1874,7 +1874,7 @@ ; ; AVX512VLBWDQ-LABEL: foldv8i16: ; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1898,7 +1898,7 @@ ; ; AVX512VLBWDQ-LABEL: foldv8i16u: ; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1922,7 +1922,7 @@ ; ; AVX512VLBWDQ-LABEL: foldv16i8: ; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: 
foldv16i8: @@ -1946,7 +1946,7 @@ ; ; AVX512VLBWDQ-LABEL: foldv16i8u: ; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: Index: test/CodeGen/X86/vector-lzcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-256.ll +++ test/CodeGen/X86/vector-lzcnt-256.ll @@ -129,7 +129,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -321,7 +321,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -493,7 +493,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -655,7 +655,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -797,7 +797,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -924,7 +924,7 @@ ; AVX512VLBWDQ: # BB#0: ; 
AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 @@ -1031,7 +1031,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1135,7 +1135,7 @@ ; AVX512VLBWDQ: # BB#0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1236,15 +1236,10 @@ } define <16 x i16> @foldv16i16() nounwind { -; NOBW-LABEL: foldv16i16: -; NOBW: # BB#0: -; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; NOBW-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv16i16: -; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512VLBWDQ-NEXT: retq +; X64-LABEL: foldv16i16: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # BB#0: @@ -1255,15 +1250,10 @@ } define <16 x i16> @foldv16i16u() nounwind { -; NOBW-LABEL: foldv16i16u: -; NOBW: # BB#0: -; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; NOBW-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv16i16u: -; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512VLBWDQ-NEXT: retq +; X64-LABEL: foldv16i16u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # BB#0: @@ -1274,15 +1264,10 @@ } define <32 x i8> @foldv32i8() nounwind { -; NOBW-LABEL: foldv32i8: -; NOBW: # BB#0: -; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; NOBW-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv32i8: -; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512VLBWDQ-NEXT: retq +; X64-LABEL: foldv32i8: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: retq ; ; X32-AVX-LABEL: 
foldv32i8: ; X32-AVX: # BB#0: @@ -1293,15 +1278,10 @@ } define <32 x i8> @foldv32i8u() nounwind { -; NOBW-LABEL: foldv32i8u: -; NOBW: # BB#0: -; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; NOBW-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv32i8u: -; AVX512VLBWDQ: # BB#0: -; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512VLBWDQ-NEXT: retq +; X64-LABEL: foldv32i8u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # BB#0: Index: test/CodeGen/X86/vector-lzcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-512.ll +++ test/CodeGen/X86/vector-lzcnt-512.ll @@ -31,7 +31,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -110,7 +110,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -187,7 +187,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -274,7 +274,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -377,7 +377,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -467,7 +467,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -582,7 +582,7 @@ ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 @@ -684,7 +684,7 @@ ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0 Index: test/CodeGen/X86/vector-popcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-512.ll +++ test/CodeGen/X86/vector-popcnt-512.ll @@ -32,7 +32,7 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -86,7 +86,7 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -137,7 +137,7 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -162,7 +162,7 @@ ; AVX512VPOPCNTDQ-BW: # BB#0: ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; 
AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -199,7 +199,7 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -229,7 +229,7 @@ ; AVX512VPOPCNTDQ-BW: # BB#0: ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 Index: test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- test/CodeGen/X86/vector-rotate-128.ll +++ test/CodeGen/X86/vector-rotate-128.ll @@ -476,7 +476,7 @@ ; ; AVX512VL-LABEL: var_rotate_v8i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllvw %xmm1, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlvw %xmm2, %xmm0, %xmm0 @@ -701,35 +701,20 @@ ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512BW-LABEL: var_rotate_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512BW-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: var_rotate_v16i8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: var_rotate_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; XOP-LABEL: var_rotate_v16i8: ; XOP: # BB#0: Index: test/CodeGen/X86/vector-rotate-256.ll =================================================================== --- test/CodeGen/X86/vector-rotate-256.ll +++ test/CodeGen/X86/vector-rotate-256.ll @@ -282,7 +282,7 @@ ; ; AVX512VL-LABEL: var_rotate_v16i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0 @@ -407,33 +407,19 @@ ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: var_rotate_v32i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: var_rotate_v32i8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VL-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 -; AVX512VL-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512VL-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: var_rotate_v32i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v32i8: ; XOPAVX1: # 
BB#0: Index: test/CodeGen/X86/vector-rotate-512.ll =================================================================== --- test/CodeGen/X86/vector-rotate-512.ll +++ test/CodeGen/X86/vector-rotate-512.ll @@ -81,7 +81,7 @@ ; ; AVX512BW-LABEL: var_rotate_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 @@ -90,7 +90,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v32i16: ; AVX512VLBW: # BB#0: -; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 @@ -226,7 +226,7 @@ ; ; AVX512BW-LABEL: var_rotate_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -261,7 +261,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v64i8: ; AVX512VLBW: # BB#0: -; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -483,7 +483,7 @@ ; ; AVX512BW-LABEL: constant_rotate_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 @@ -496,7 +496,7 @@ ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -516,7 +516,7 @@ ; ; AVX512VLBW-LABEL: constant_rotate_v64i8: ; AVX512VLBW: # BB#0: -; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 @@ -529,7 +529,7 @@ ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 Index: test/CodeGen/X86/vector-shift-ashr-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-512.ll +++ test/CodeGen/X86/vector-shift-ashr-512.ll @@ -356,7 +356,7 @@ ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63] ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} Index: test/CodeGen/X86/vector-shift-lshr-512.ll =================================================================== --- 
test/CodeGen/X86/vector-shift-lshr-512.ll +++ test/CodeGen/X86/vector-shift-lshr-512.ll @@ -269,7 +269,7 @@ ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-shift-shl-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-512.ll +++ test/CodeGen/X86/vector-shift-shl-512.ll @@ -252,7 +252,7 @@ ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1384,21 +1384,13 @@ ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: PR12412: -; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: PR12412: -; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq +; AVX-LABEL: PR12412: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq entry: %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> ret <16 x i8> %0 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -168,7 +168,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -196,7 +196,7 @@ ; ; 
AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -223,7 +223,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -250,7 +250,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -276,7 +276,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -302,7 +302,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -328,7 +328,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -851,7 +851,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -878,7 +878,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -911,7 +911,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] 
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -942,7 +942,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1131,7 +1131,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1157,7 +1157,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1410,7 +1410,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1437,7 +1437,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1465,7 +1465,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1492,7 +1492,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1516,7 +1516,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1705,7 +1705,7 @@ 
; ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] ; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 @@ -1802,7 +1802,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1825,7 +1825,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1902,7 +1902,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2141,25 +2141,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2295,25 +2285,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2329,25 +2309,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2363,25 +2333,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: 
shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2397,25 +2357,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2795,25 +2745,15 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; 
AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2844,7 +2784,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2877,7 +2817,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2910,7 +2850,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2943,7 +2883,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2983,7 +2923,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3020,7 +2960,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3056,7 +2996,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] +; AVX512VL-NEXT: vmovdqa 
{{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3089,7 +3029,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3123,7 +3063,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3161,7 +3101,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3191,7 +3131,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u> ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3219,7 +3159,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u> ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3260,7 +3200,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u> ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3286,7 +3226,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3314,7 +3254,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u> ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3348,7 +3288,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: 
vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3374,7 +3314,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3407,7 +3347,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3438,7 +3378,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3469,7 +3409,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3575,7 +3515,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3682,7 +3622,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3730,7 +3670,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3778,7 +3718,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u> ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3896,7 +3836,7 @@ ; ; AVX512VL-LABEL: 
shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> @@ -3933,7 +3873,7 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX512VL-NEXT: retq @@ -4023,7 +3963,7 @@ ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -53,7 +53,7 @@ ; ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: ; SKX: ## BB#0: -; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> @@ -86,7 +86,7 @@ ; ; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: ; SKX: ## BB#0: -; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] ; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> Index: test/CodeGen/X86/vector-shuffle-512-v64.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v64.ll +++ test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -194,7 +194,7 @@ ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512VBMI: # BB#0: -; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> @@ -492,7 +492,7 @@ ; AVX512VBMI-LABEL: 
shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512VBMI: # BB#0: ; AVX512VBMI-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127] +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> @@ -549,7 +549,7 @@ ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512VBMI: # BB#0: -; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126] +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> Index: test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -547,18 +547,18 @@ ; X32-LABEL: combine_vpermt2var_32i16_identity_mask: ; X32: # BB#0: ; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z} -; X32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] ; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z} ; X32-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_32i16_identity_mask: ; X64: # BB#0: ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z} -; X64-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] ; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} 
{z} ; X64-NEXT: retq %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 %m) @@ -584,7 +584,7 @@ ; X32-LABEL: combine_pshufb_identity_mask: ; X32: # BB#0: ; X32-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X32-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} @@ -595,7 +595,7 @@ ; X64-LABEL: combine_pshufb_identity_mask: ; X64: # BB#0: ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} @@ -1009,13 +1009,13 @@ define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; X32-LABEL: combine_vpermi2var_32i16_as_permw: ; X32: # BB#0: -; X32-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] +; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] ; X32-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_32i16_as_permw: ; X64: # BB#0: -; X64-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1, i32 -1) @@ -1062,14 +1062,14 @@ define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw: ; X32: # BB#0: -; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] +; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] ; X32-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; X32-NEXT: vmovdqa64 %zmm2, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw: ; X64: # BB#0: -; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] ; X64-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll =================================================================== --- 
test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -22,18 +22,18 @@
; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
-; X32-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
-; X64-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %x0, <16 x i16> %x1, i16 %m)
@@ -44,13 +44,13 @@
define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_16i16_as_permw:
; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X32-NEXT: vpermw %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16i16_as_permw:
; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1)
@@ -61,13 +61,13 @@
define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X32-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X64-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1)
Index: test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -37,18 +37,18 @@
; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
-; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
-; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %x0, <16 x i8> %x1, i16 %m)
@@ -73,13 +73,13 @@
define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_32i8_as_vpermb:
; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermb %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i8_as_vpermb:
; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32>
@@ -89,13 +89,13 @@
define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_64i8_as_vpermb:
; X32: # BB#0:
-; X32-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermb %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_64i8_as_vpermb:
; X64: # BB#0:
-; X64-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32>
@@ -106,17 +106,17 @@
define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
-; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
+; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
-; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
+; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x1, i16 -1)
@@ -126,13 +126,13 @@
define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_32i8_as_vperm2:
; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i8_as_vperm2:
; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; X64-NEXT: retq
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32>
@@ -142,13 +142,13 @@
define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_64i8_as_vperm2:
; X32: # BB#0:
-; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_64i8_as_vperm2:
; X64: # BB#0:
-; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
%res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32>
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -144,7 +144,7 @@
; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -1114,37 +1114,13 @@
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512F-LABEL: trunc2x8i16_16i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x8i16_16i8:
-; AVX512VL: # BB#0: # %entry
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x8i16_16i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x8i16_16i8:
-; AVX512BWVL: # BB#0: # %entry
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc2x8i16_16i8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i16> %a to <8 x i8>
%1 = trunc <8 x i16> %b to <8 x i8>
Index: test/CodeGen/X86/vector-tzcnt-512.ll
===================================================================
--- test/CodeGen/X86/vector-tzcnt-512.ll
+++ test/CodeGen/X86/vector-tzcnt-512.ll
@@ -42,7 +42,7 @@
; AVX512CDBW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -60,7 +60,7 @@
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -112,7 +112,7 @@
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -180,7 +180,7 @@
; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -202,7 +202,7 @@
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -258,7 +258,7 @@
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -326,7 +326,7 @@
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -346,7 +346,7 @@
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -420,7 +420,7 @@
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -440,7 +440,7 @@
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -508,7 +508,7 @@
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -525,7 +525,7 @@
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -598,7 +598,7 @@
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -615,7 +615,7 @@
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
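
For anyone who wants to reproduce one of the codegen changes above locally, here is a minimal sketch; the file name repro.ll is hypothetical and the exact output depends on the checked-out revision. With the plain-load patterns dropped from VMOVDQU8/VMOVDQU16, an unmasked 512-bit byte or word load is expected to come out as vmovdqa64 (aligned) or vmovdqu64 (unaligned) instead of vmovdqu8/vmovdqu16, which is the flavor of CHECK-line churn shown in the hunks above.

; repro.ll -- hypothetical reproducer, run with:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw repro.ll -o -
; The aligned load below should now select vmovdqa64 rather than vmovdqu8;
; dropping the `align 64` should give vmovdqu64 instead.
define <64 x i8> @load_v64i8(<64 x i8>* %p) {
  %v = load <64 x i8>, <64 x i8>* %p, align 64
  ret <64 x i8> %v
}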