diff --git a/i/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/i/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2149,6 +2149,13 @@
   if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
                      ((DstAlign == 0 || DstAlign >= 16) &&
                       (SrcAlign == 0 || SrcAlign >= 16)))) {
+    // FIXME: Check if unaligned 64-byte accesses are slow.
+    if (Size >= 64 && Subtarget.hasAVX512() &&
+        (Subtarget.getPreferVectorWidth() >= 512)) {
+      if (Subtarget.hasBWI() && Subtarget.hasVLX())
+        return MVT::v64i8;
+      return MVT::v16i32;
+    }
     // FIXME: Check if unaligned 32-byte accesses are slow.
     if (Size >= 32 && Subtarget.hasAVX() &&
         (Subtarget.getPreferVectorWidth() >= 256)) {
diff --git a/i/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
--- a/i/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s -check-prefix=LINUX-SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s -check-prefix=LINUX-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=LINUX-KNL
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
 declare void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* nocapture, i8 addrspace(256)* nocapture, i64, i1) nounwind
@@ -48,6 +51,30 @@
 ; LINUX-NEXT:    movl $64, %edx
 ; LINUX-NEXT:    jmp memcpy # TAILCALL
 ;
+; LINUX-SKL-LABEL: test3:
+; LINUX-SKL:       # %bb.0: # %entry
+; LINUX-SKL-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKL-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKL-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKL-NEXT:    vzeroupper
+; LINUX-SKL-NEXT:    retq
+;
+; LINUX-SKX-LABEL: test3:
+; LINUX-SKX:       # %bb.0: # %entry
+; LINUX-SKX-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKX-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKX-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKX-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKX-NEXT:    vzeroupper
+; LINUX-SKX-NEXT:    retq
+;
+; LINUX-KNL-LABEL: test3:
+; LINUX-KNL:       # %bb.0: # %entry
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
+; LINUX-KNL-NEXT:    retq
+;
 ; DARWIN-LABEL: test3:
 ; DARWIN:       ## %bb.0: ## %entry
 ; DARWIN-NEXT:    movq 56(%rsi), %rax
@@ -79,6 +106,30 @@
 ; LINUX-NEXT:    popq %rdx
 ; LINUX-NEXT:    jmp memcpy # TAILCALL
 ;
+; LINUX-SKL-LABEL: test3_minsize:
+; LINUX-SKL:       # %bb.0:
+; LINUX-SKL-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKL-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKL-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKL-NEXT:    vzeroupper
+; LINUX-SKL-NEXT:    retq
+;
+; LINUX-SKX-LABEL: test3_minsize:
+; LINUX-SKX:       # %bb.0:
+; LINUX-SKX-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKX-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKX-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKX-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKX-NEXT:    vzeroupper
+; LINUX-SKX-NEXT:    retq
+;
+; LINUX-KNL-LABEL: test3_minsize:
+; LINUX-KNL:       # %bb.0:
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
+; LINUX-KNL-NEXT:    retq
+;
 ; DARWIN-LABEL: test3_minsize:
 ; DARWIN:       ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
@@ -95,6 +146,30 @@
 ; LINUX-NEXT:    popq %rdx
 ; LINUX-NEXT:    jmp memcpy # TAILCALL
 ;
+; LINUX-SKL-LABEL: test3_minsize_optsize:
+; LINUX-SKL:       # %bb.0:
+; LINUX-SKL-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKL-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKL-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKL-NEXT:    vzeroupper
+; LINUX-SKL-NEXT:    retq
+;
+; LINUX-SKX-LABEL: test3_minsize_optsize:
+; LINUX-SKX:       # %bb.0:
+; LINUX-SKX-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKX-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKX-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKX-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKX-NEXT:    vzeroupper
+; LINUX-SKX-NEXT:    retq
+;
+; LINUX-KNL-LABEL: test3_minsize_optsize:
+; LINUX-KNL:       # %bb.0:
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
+; LINUX-KNL-NEXT:    retq
+;
 ; DARWIN-LABEL: test3_minsize_optsize:
 ; DARWIN:       ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
@@ -126,6 +201,30 @@
 ; LINUX-NEXT:    movq %rax, (%rdi)
 ; LINUX-NEXT:    retq
 ;
+; LINUX-SKL-LABEL: test4:
+; LINUX-SKL:       # %bb.0: # %entry
+; LINUX-SKL-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKL-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKL-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKL-NEXT:    vzeroupper
+; LINUX-SKL-NEXT:    retq
+;
+; LINUX-SKX-LABEL: test4:
+; LINUX-SKX:       # %bb.0: # %entry
+; LINUX-SKX-NEXT:    vmovups (%rsi), %ymm0
+; LINUX-SKX-NEXT:    vmovups 32(%rsi), %ymm1
+; LINUX-SKX-NEXT:    vmovups %ymm1, 32(%rdi)
+; LINUX-SKX-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-SKX-NEXT:    vzeroupper
+; LINUX-SKX-NEXT:    retq
+;
+; LINUX-KNL-LABEL: test4:
+; LINUX-KNL:       # %bb.0: # %entry
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
+; LINUX-KNL-NEXT:    retq
+;
 ; DARWIN-LABEL: test4:
 ; DARWIN:       ## %bb.0: ## %entry
 ; DARWIN-NEXT:    movq 56(%rsi), %rax