diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2149,6 +2149,11 @@ if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { + // FIXME: Check if unaligned 64-byte accesses are slow. + if (Size >= 64 && Subtarget.hasAVX512() && + (Subtarget.getPreferVectorWidth() >= 512)) { + return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; + } // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s -check-prefix=LINUX-SKL -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s -check-prefix=LINUX-SKX -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=LINUX-KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s -check-prefix=LINUX-SKL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s -check-prefix=LINUX-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=LINUX-KNL ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu 
-mattr=avx512bw | FileCheck %s -check-prefix=LINUX-AVX512BW declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind @@ -124,10 +124,8 @@ ; ; LINUX-KNL-LABEL: test3: ; LINUX-KNL: # %bb.0: # %entry -; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3: @@ -174,10 +172,8 @@ ; ; LINUX-KNL-LABEL: test3_minsize: ; LINUX-KNL: # %bb.0: -; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3_minsize: @@ -223,10 +219,8 @@ ; ; LINUX-KNL-LABEL: test3_minsize_optsize: ; LINUX-KNL: # %bb.0: -; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3_minsize_optsize: @@ -301,10 +295,8 @@ ; ; LINUX-KNL-LABEL: test4: ; LINUX-KNL: # %bb.0: # %entry -; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test4: diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm --check-prefix=AVX512F-ymm +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm --check-prefix=AVX512BW-ymm +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-zmm --check-prefix=AVX512F-zmm +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-zmm --check-prefix=AVX512BW-zmm ; https://llvm.org/bugs/show_bug.cgi?id=27100 @@ -82,13 +84,32 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: 
memset_64_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: memset_64_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_64_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_64_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi) +; AVX512BW-zmm-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512F-zmm-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512-zmm-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1) ret void } @@ -128,15 +149,39 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_128_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 
{{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %ymm0, 96(%rdi) -; AVX-NEXT: vmovups %ymm0, 64(%rdi) -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: memset_128_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %ymm0, 96(%rdi) +; AVX1-NEXT: vmovups %ymm0, 64(%rdi) +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_128_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups %ymm0, 96(%rdi) +; AVX2-NEXT: vmovups %ymm0, 64(%rdi) +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_128_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-ymm-NEXT: vmovups %ymm0, 96(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 64(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi) +; AVX512BW-zmm-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512F-zmm-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512-zmm-NEXT: vmovups %zmm0, 64(%rdi) +; AVX512-zmm-NEXT: vmovups %zmm0, 
(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1) ret void } @@ -174,19 +219,53 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_256_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %ymm0, 224(%rdi) -; AVX-NEXT: vmovups %ymm0, 192(%rdi) -; AVX-NEXT: vmovups %ymm0, 160(%rdi) -; AVX-NEXT: vmovups %ymm0, 128(%rdi) -; AVX-NEXT: vmovups %ymm0, 96(%rdi) -; AVX-NEXT: vmovups %ymm0, 64(%rdi) -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: memset_256_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %ymm0, 224(%rdi) +; AVX1-NEXT: vmovups %ymm0, 192(%rdi) +; AVX1-NEXT: vmovups %ymm0, 160(%rdi) +; AVX1-NEXT: vmovups %ymm0, 128(%rdi) +; AVX1-NEXT: vmovups %ymm0, 96(%rdi) +; AVX1-NEXT: vmovups %ymm0, 64(%rdi) +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_256_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups %ymm0, 224(%rdi) +; AVX2-NEXT: vmovups %ymm0, 192(%rdi) +; AVX2-NEXT: vmovups %ymm0, 160(%rdi) +; AVX2-NEXT: vmovups %ymm0, 128(%rdi) +; AVX2-NEXT: vmovups %ymm0, 96(%rdi) +; AVX2-NEXT: vmovups %ymm0, 64(%rdi) +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_256_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = 
[42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-ymm-NEXT: vmovups %ymm0, 224(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 192(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 160(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 128(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 96(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 64(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi) +; AVX512BW-zmm-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512F-zmm-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512-zmm-NEXT: vmovups %zmm0, 192(%rdi) +; AVX512-zmm-NEXT: vmovups %zmm0, 128(%rdi) +; AVX512-zmm-NEXT: vmovups %zmm0, 64(%rdi) +; AVX512-zmm-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1) ret void } @@ -341,13 +420,18 @@ ; AVX2-NEXT: retq ; ; AVX512-LABEL: memset_64_nonconst_bytes: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512: # %bb.0: +; AVX512-ymm-NEXT: vmovd %esi, %xmm0 +; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-zmm-NEXT: vpbroadcastb %esi, %zmm0 +; AVX512F-zmm-NEXT: movzbl %sil, %eax +; AVX512F-zmm-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512F-zmm-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq tail 
call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false) ret void } @@ -419,12 +503,18 @@ ; ; AVX512-LABEL: memset_128_nonconst_bytes: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, 96(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-ymm-NEXT: vmovd %esi, %xmm0 +; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-zmm-NEXT: vpbroadcastb %esi, %zmm0 +; AVX512F-zmm-NEXT: movzbl %sil, %eax +; AVX512F-zmm-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512F-zmm-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-ymm-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false) @@ -495,16 +585,24 @@ ; ; AVX512-LABEL: memset_256_nonconst_bytes: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, 224(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 192(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 160(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 128(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 96(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) -; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-ymm-NEXT: vmovd %esi, %xmm0 +; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-zmm-NEXT: vpbroadcastb %esi, %zmm0 +; AVX512F-zmm-NEXT: movzbl %sil, %eax +; AVX512F-zmm-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512F-zmm-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-ymm-NEXT: vmovdqu %ymm0, 224(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 192(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 160(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 
128(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, 192(%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, 128(%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-zmm-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false) diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -752,8 +752,7 @@ ; KNL-LABEL: memset_64: ; KNL: # %bb.0: # %entry ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %ymm0, 32(%rdi) -; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: vmovups %zmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false) @@ -819,8 +818,7 @@ ; KNL-LABEL: memset_64_align64: ; KNL: # %bb.0: # %entry ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovaps %ymm0, 32(%rdi) -; KNL-NEXT: vmovaps %ymm0, (%rdi) +; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 false)