Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -3348,7 +3348,8 @@
 }
 multiclass avx512_store opc, string OpcodeStr, X86VectorVTInfo _,
-                        PatFrag st_frag, PatFrag mstore, string Name> {
+                        PatFrag st_frag, PatFrag mstore, string Name,
+                        bit NoMRPattern = 0> {
   let hasSideEffects = 0 in {
   def rr_REV : AVX512PI, EVEX, EVEX_KZ, FoldGenData;
   }
+  let hasSideEffects = 0, mayStore = 1 in
   def mr : AVX512PI, EVEX;
+           !if(NoMRPattern, [],
+               [(st_frag (_.VT _.RC:$src), addr:$dst)]),
+           _.ExeDomain>, EVEX;
   def mrk : AVX512PI opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd,
-                     string Name> {
+                     string Name, bit NoMRPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_store, EVEX_V512;
+           masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
     defm Z256 : avx512_store, EVEX_V256;
+                masked_store_unaligned, Name#Z256,
+                NoMRPattern>, EVEX_V256;
     defm Z128 : avx512_store, EVEX_V128;
+                masked_store_unaligned, Name#Z128,
+                NoMRPattern>, EVEX_V128;
   }
 }
@@ -3448,12 +3454,12 @@
 defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
                 avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
-                                HasBWI, "VMOVDQU8">,
+                                HasBWI, "VMOVDQU8", 1>,
                 XD, EVEX_CD8<8, CD8VF>;
 defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
                  avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
-                                 HasBWI, "VMOVDQU16">,
+                                 HasBWI, "VMOVDQU16", 1>,
                  XD, VEX_W, EVEX_CD8<16, CD8VF>;
 defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
@@ -3538,8 +3544,20 @@
                   sub_ymm)>;
 }
-let Predicates = [HasVLX, NoBWI] in {
-  // 128-bit load/store without BWI.
+let Predicates = [HasAVX512] in {
+  // 512-bit store.
+  def : Pat<(alignedstore512 (v32i16 VR512:$src), addr:$dst),
+            (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+  def : Pat<(alignedstore512 (v64i8 VR512:$src), addr:$dst),
+            (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+  def : Pat<(store (v32i16 VR512:$src), addr:$dst),
+            (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+  def : Pat<(store (v64i8 VR512:$src), addr:$dst),
+            (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+  // 128-bit store.
   def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
             (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
   def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
@@ -3549,7 +3567,7 @@
   def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
             (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
-  // 256-bit load/store without BWI.
+  // 256-bit store.
   def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
             (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
   def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
@@ -3558,9 +3576,7 @@
             (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
   def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
             (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
-}
-let Predicates = [HasVLX] in {
   // Special patterns for storing subvector extracts of lower 128-bits of 256.
   // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
   def : Pat<(alignedstore (v2f64 (extract_subvector
Index: llvm/trunk/test/CodeGen/X86/avg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll
+++ llvm/trunk/test/CodeGen/X86/avg.ll
@@ -712,7 +712,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <64 x i8>, <64 x i8>* %a
@@ -1101,7 +1101,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <32 x i16>, <32 x i16>* %a
@@ -1734,7 +1734,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <64 x i8>, <64 x i8>* %a
@@ -2124,7 +2124,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <32 x i16>, <32 x i16>* %a
@@ -2649,7 +2649,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <64 x i8>, <64 x i8>* %a
@@ -2957,7 +2957,7 @@
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %1 = load <32 x i16>, <32 x i16>* %a
Index: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1839,21 +1839,13 @@
 }
 define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8i16:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI %EDI %RDI
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8i16:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8i16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %EDI %EDI %RDI
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; CHECK-NEXT: retq
 %t2 = extractelement <8 x i16> %t1, i32 %index
 ret i16 %t2
 }
@@ -1892,7 +1884,7 @@
 ; SKX-NEXT: andq $-32, %rsp
 ; SKX-NEXT: subq $64, %rsp
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu %ymm0, (%rsp)
+; SKX-NEXT: vmovaps %ymm0, (%rsp)
 ; SKX-NEXT: andl $15, %edi
 ; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
 ; SKX-NEXT: movq %rbp, %rsp
@@ -1938,7 +1930,7 @@
 ; SKX-NEXT: andq $-64, %rsp
 ; SKX-NEXT: subq $128, %rsp
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
 ; SKX-NEXT: andl $31, %edi
 ; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
 ; SKX-NEXT: movq %rbp, %rsp
@@ -1950,23 +1942,14 @@
 }
 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i8:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI %EDI %RDI
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i8:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16i8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %EDI %EDI %RDI
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: retq
 %t2 = extractelement <16 x i8> %t1, i32 %index
 ret i8 %t2
 }
@@ -2006,7 +1989,7 @@
 ; SKX-NEXT: andq $-32, %rsp
 ; SKX-NEXT: subq $64, %rsp
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu %ymm0, (%rsp)
+; SKX-NEXT: vmovaps %ymm0, (%rsp)
 ; SKX-NEXT: andl $31, %edi
 ; SKX-NEXT: movq %rsp, %rax
 ; SKX-NEXT: movb (%rdi,%rax), %al
@@ -2055,7 +2038,7 @@
 ; SKX-NEXT: andq $-64, %rsp
 ; SKX-NEXT: subq $128, %rsp
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
 ; SKX-NEXT: andl $63, %edi
 ; SKX-NEXT: movq %rsp, %rax
 ; SKX-NEXT: movb (%rdi,%rax), %al
@@ -2105,7 +2088,7 @@
 ; SKX-NEXT: andq $-64, %rsp
 ; SKX-NEXT: subq $128, %rsp
 ; SKX-NEXT: addb %dil, %dil
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
 ; SKX-NEXT: movzbl %dil, %eax
 ; SKX-NEXT: andl $63, %eax
 ; SKX-NEXT: movq %rsp, %rcx
@@ -2330,7 +2313,7 @@
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
 ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
 ; SKX-NEXT: andl $31, %edi
 ; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
 ; SKX-NEXT: andl $1, %eax
Index: llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -19,7 +19,7 @@
 ; SKX-NEXT: ## kill: %EDI %EDI %RDI
 ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
 ; SKX-NEXT: vpmovm2b %k0, %zmm0
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
 ; SKX-NEXT: andl $63, %edi
 ; SKX-NEXT: movq %rsp, %rax
 ; SKX-NEXT: movzbl (%rdi,%rax), %eax
Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -9,7 +9,7 @@
 ; AVX512BW: ## BB#0:
 ; AVX512BW-NEXT: kmovq %rdx, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
@@ -18,7 +18,7 @@
 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)
+; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
 ; AVX512F-32-NEXT: retl
 call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
 call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
@@ -32,7 +32,7 @@
 ; AVX512BW: ## BB#0:
 ; AVX512BW-NEXT: kmovd %edx, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
@@ -41,7 +41,7 @@
 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)
+; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
 ; AVX512F-32-NEXT: retl
 call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
 call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
Index: llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll
+++ llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll
@@ -14,7 +14,7 @@
 define void @test2(i8 * %addr, <64 x i8> %data) {
 ; CHECK-LABEL: test2:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu8 %zmm0, (%rdi)
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
 ; CHECK-NEXT: retq
 %vaddr = bitcast i8* %addr to <64 x i8>*
 store <64 x i8>%data, <64 x i8>* %vaddr, align 1
@@ -62,7 +62,7 @@
 define void @test6(i8 * %addr, <32 x i16> %data) {
 ; CHECK-LABEL: test6:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu16 %zmm0, (%rdi)
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
 ; CHECK-NEXT: retq
 %vaddr = bitcast i8* %addr to <32 x i16>*
 store <32 x i16>%data, <32 x i16>* %vaddr, align 1
Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -14,7 +14,7 @@
 define void @test_256_2(i8 * %addr, <32 x i8> %data) {
 ; CHECK-LABEL: test_256_2:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %vaddr = bitcast i8* %addr to <32 x i8>*
 store <32 x i8>%data, <32 x i8>* %vaddr, align 1
@@ -62,7 +62,7 @@
 define void @test_256_6(i8 * %addr, <16 x i16> %data) {
 ; CHECK-LABEL: test_256_6:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %vaddr = bitcast i8* %addr to <16 x i16>*
 store <16 x i16>%data, <16 x i16>* %vaddr, align 1
@@ -110,7 +110,7 @@
 define void @test_128_2(i8 * %addr, <16 x i8> %data) {
 ; CHECK-LABEL: test_128_2:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %vaddr = bitcast i8* %addr to <16 x i8>*
 store <16 x i8>%data, <16 x i8>* %vaddr, align 1
@@ -158,7 +158,7 @@
 define void @test_128_6(i8 * %addr, <8 x i16> %data) {
 ; CHECK-LABEL: test_128_6:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %vaddr = bitcast i8* %addr to <8 x i16>*
 store <8 x i16>%data, <8 x i16>* %vaddr, align 1
Index: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
+++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
@@ -894,69 +894,21 @@
 }
 define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
-; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_8i16_16i16_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
 ;
-; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_8i16_16i16_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 store <8 x i16> %1, <8 x i16>* %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32>
@@ -964,69 +916,21 @@
 }
 define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
-; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_16i8_32i8_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
 ;
-; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_16i8_32i8_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 store <16 x i8> %1, <16 x i8>* %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32>
Index: llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
+++ llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
@@ -361,8 +361,8 @@
 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: vmovdqu8 %zmm0, 64(%rdi)
-; AVX512-NEXT: vmovdqu8 %zmm2, (%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm0, 64(%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm2, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32>
@@ -444,7 +444,7 @@
 ; AVX512-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqu8 %zmm0, (%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32>
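
The updated CHECK lines above all reflect the same codegen change: a plain, unmasked IR store of a byte or word vector no longer selects the BWI-only vmovdqu8/vmovdqu16 forms, while masked stores such as "vmovdqu8 %zmm0, (%rdi) {%k1}" are unaffected. A reduced sketch of the kind of input these tests exercise (the function name below is illustrative and not taken from the patch; the exact mnemonic chosen depends on subtarget and alignment, as the updated tests show):

define void @store_v64i8_unaligned(<64 x i8>* %p, <64 x i8> %v) {
  ; Previously this selected vmovdqu8; per the updated tests it now selects
  ; vmovdqu32 (avg.ll) or vmovups (avx512bw-mov.ll test2 on SKX) instead.
  store <64 x i8> %v, <64 x i8>* %p, align 1
  ret void
}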