diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1637,6 +1637,14 @@ Group<f_Group>; def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">, Group<f_Group>; +def fuse_unaligned_vector_move : Flag<["-"], "fuse-unaligned-vector-move">, + Group<f_Group>, Flags<[CoreOption, HelpHidden]>, + HelpText<"Enable transforming aligned vector move instruction to " + "unaligned vector move.">; +def fno_use_unaligned_vector_move : Flag<["-"], "fno-use-unaligned-vector-move">, + Group<f_Group>, Flags<[CoreOption, HelpHidden]>, + HelpText<"Disable transforming aligned vector move instruction to " + "unaligned vector move.">; def fassociative_math : Flag<["-"], "fassociative-math">, Group<f_Group>; def fno_associative_math : Flag<["-"], "fno-associative-math">, Group<f_Group>; defm reciprocal_math : BoolFOption<"reciprocal-math", diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2163,6 +2163,10 @@ CmdArgs.push_back("-mstack-alignment=4"); } + // Enable transforming aligned vector move instruction to unaligned vector + // move. + addX86UnalignedVectorMoveArgs(Args, CmdArgs, /*IsLTO=*/false); + // Handle -mtune. // Default to "generic" unless -march is present or targetting the PS4. diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -149,6 +149,10 @@ void addOpenMPDeviceRTL(const Driver &D, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, StringRef BitcodeSuffix, const llvm::Triple &Triple); + +void addX86UnalignedVectorMoveArgs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + bool IsLTO); } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -626,6 +626,7 @@ Args.MakeArgString(Twine("-plugin-opt=stats-file=") + StatsFile)); addX86AlignBranchArgs(D, Args, CmdArgs, /*IsLTO=*/true); + addX86UnalignedVectorMoveArgs(Args, CmdArgs, /*IsLTO=*/true); // Handle remark diagnostics on screen options: '-Rpass-*'.
renderRpassOptions(Args, CmdArgs); @@ -1704,3 +1705,20 @@ << LibOmpTargetName << ArchPrefix; } } + +void tools::addX86UnalignedVectorMoveArgs(const ArgList &Args, + ArgStringList &CmdArgs, bool IsLTO) { + auto addArg = [&, IsLTO](const Twine &Arg) { + if (IsLTO) { + CmdArgs.push_back(Args.MakeArgString("-plugin-opt=" + Arg)); + } else { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString(Arg)); + } + }; + + if (Args.hasArg(options::OPT_fuse_unaligned_vector_move)) + addArg(Twine("-x86-enable-unaligned-vector-move=true")); + else if (Args.hasArg(options::OPT_fno_use_unaligned_vector_move)) + addArg(Twine("-x86-enable-unaligned-vector-move=false")); +} diff --git a/clang/test/Driver/x86-unaligned-vector-move.c b/clang/test/Driver/x86-unaligned-vector-move.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/x86-unaligned-vector-move.c @@ -0,0 +1,10 @@ +// RUN: %clang -### -target x86_64-unknown-linux -fuse-unaligned-vector-move -c %s 2>&1 | FileCheck -check-prefix=UNALIGNED_VECTOR_MOVE %s +// RUN: %clang_cl -### --target=x86_64-pc-windows-msvc -fuse-unaligned-vector-move -c %s 2>&1 | FileCheck -check-prefix=UNALIGNED_VECTOR_MOVE %s +// UNALIGNED_VECTOR_MOVE: "-mllvm" "-x86-enable-unaligned-vector-move=true" +// RUN: %clang -### -target x86_64-unknown-linux -fuse-unaligned-vector-move -flto %s 2>&1 | FileCheck -check-prefix=LTO_MUNALIGNED_VECTOR_MOVE %s +// LTO_MUNALIGNED_VECTOR_MOVE: "-plugin-opt=-x86-enable-unaligned-vector-move=true" +// RUN: %clang -### -target x86_64-unknown-linux -fno-use-unaligned-vector-move -c %s 2>&1 | FileCheck -check-prefix=NO_UNALIGNED_VECTOR_MOVE %s +// RUN: %clang_cl -### --target=x86_64-pc-windows-msvc -fno-use-unaligned-vector-move -c %s 2>&1 | FileCheck -check-prefix=NO_UNALIGNED_VECTOR_MOVE %s +// NO_UNALIGNED_VECTOR_MOVE: "-mllvm" "-x86-enable-unaligned-vector-move=false" +// RUN: %clang -### -target x86_64-unknown-linux -fno-use-unaligned-vector-move -flto %s 2>&1 | FileCheck -check-prefix=LTO_NO_MUNALIGNED_VECTOR_MOVE %s +// LTO_NO_MUNALIGNED_VECTOR_MOVE: "-plugin-opt=-x86-enable-unaligned-vector-move=false" diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -48,6 +48,97 @@ using namespace llvm; +static cl::opt<bool> EnableX86UnalignedVecMove( + "x86-enable-unaligned-vector-move", cl::Hidden, + cl::desc("X86: Enable transforming aligned vector move instruction to " + "unaligned vector move."), + cl::init(false)); + +static const unsigned AlignedMovToUnalignedMovTable[][2] = { + // Replace vmovaps with vmovups. + // MOVAPSmr and MOVAPSrm aren't used when AVX is enabled.
+ { X86::VMOVAPSYmr, X86::VMOVUPSYmr }, + { X86::VMOVAPSYrm, X86::VMOVUPSYrm }, + { X86::VMOVAPSZ128mr, X86::VMOVUPSZ128mr }, + { X86::VMOVAPSZ128mr_NOVLX, X86::VMOVUPSZ128mr_NOVLX }, + { X86::VMOVAPSZ128mrk, X86::VMOVUPSZ128mrk }, + { X86::VMOVAPSZ128rm, X86::VMOVUPSZ128rm }, + { X86::VMOVAPSZ128rm_NOVLX, X86::VMOVUPSZ128rm_NOVLX }, + { X86::VMOVAPSZ128rmk, X86::VMOVUPSZ128rmk }, + { X86::VMOVAPSZ128rmkz, X86::VMOVUPSZ128rmkz }, + { X86::VMOVAPSZ256mr, X86::VMOVUPSZ256mr }, + { X86::VMOVAPSZ256mr_NOVLX, X86::VMOVUPSZ256mr_NOVLX }, + { X86::VMOVAPSZ256mrk, X86::VMOVUPSZ256mrk }, + { X86::VMOVAPSZ256rm, X86::VMOVUPSZ256rm }, + { X86::VMOVAPSZ256rm_NOVLX, X86::VMOVUPSZ256rm_NOVLX }, + { X86::VMOVAPSZ256rmk, X86::VMOVUPSZ256rmk }, + { X86::VMOVAPSZ256rmkz, X86::VMOVUPSZ256rmkz }, + { X86::VMOVAPSZmr, X86::VMOVUPSZmr }, + { X86::VMOVAPSZmrk, X86::VMOVUPSZmrk }, + { X86::VMOVAPSZrm, X86::VMOVUPSZrm }, + { X86::VMOVAPSZrmk, X86::VMOVUPSZrmk }, + { X86::VMOVAPSZrmkz, X86::VMOVUPSZrmkz }, + { X86::VMOVAPSmr, X86::VMOVUPSmr }, + { X86::VMOVAPSrm, X86::VMOVUPSrm }, + // Replace vmovapd with vmovupd. + // MOVAPDmr and MOVAPDrm aren't used when AVX is enabled. + { X86::VMOVAPDYmr, X86::VMOVUPDYmr }, + { X86::VMOVAPDYrm, X86::VMOVUPDYrm }, + { X86::VMOVAPDZ128mr, X86::VMOVUPDZ128mr }, + { X86::VMOVAPDZ128mrk, X86::VMOVUPDZ128mrk }, + { X86::VMOVAPDZ128rm, X86::VMOVUPDZ128rm }, + { X86::VMOVAPDZ128rmk, X86::VMOVUPDZ128rmk }, + { X86::VMOVAPDZ128rmkz, X86::VMOVUPDZ128rmkz }, + { X86::VMOVAPDZ256mr, X86::VMOVUPDZ256mr }, + { X86::VMOVAPDZ256mrk, X86::VMOVUPDZ256mrk }, + { X86::VMOVAPDZ256rm, X86::VMOVUPDZ256rm }, + { X86::VMOVAPDZ256rmk, X86::VMOVUPDZ256rmk }, + { X86::VMOVAPDZ256rmkz, X86::VMOVUPDZ256rmkz }, + { X86::VMOVAPDZmr, X86::VMOVUPDZmr }, + { X86::VMOVAPDZmrk, X86::VMOVUPDZmrk }, + { X86::VMOVAPDZrm, X86::VMOVUPDZrm }, + { X86::VMOVAPDZrmk, X86::VMOVUPDZrmk }, + { X86::VMOVAPDZrmkz, X86::VMOVUPDZrmkz }, + { X86::VMOVAPDmr, X86::VMOVUPDmr }, + { X86::VMOVAPDrm, X86::VMOVUPDrm }, + // Replace vmovdqa with vmovdqu. + // MOVDQAmr and MOVDQArm aren't used when AVX is enabled. 
+ { X86::VMOVDQA32Z128mr, X86::VMOVDQU32Z128mr }, + { X86::VMOVDQA32Z128mrk, X86::VMOVDQU32Z128mrk }, + { X86::VMOVDQA32Z128rm, X86::VMOVDQU32Z128rm }, + { X86::VMOVDQA32Z128rmk, X86::VMOVDQU32Z128rmk }, + { X86::VMOVDQA32Z128rmkz, X86::VMOVDQU32Z128rmkz }, + { X86::VMOVDQA32Z256mr, X86::VMOVDQU32Z256mr }, + { X86::VMOVDQA32Z256mrk, X86::VMOVDQU32Z256mrk }, + { X86::VMOVDQA32Z256rm, X86::VMOVDQU32Z256rm }, + { X86::VMOVDQA32Z256rmk, X86::VMOVDQU32Z256rmk }, + { X86::VMOVDQA32Z256rmkz, X86::VMOVDQU32Z256rmkz }, + { X86::VMOVDQA32Zmr, X86::VMOVDQU32Zmr }, + { X86::VMOVDQA32Zmrk, X86::VMOVDQU32Zmrk }, + { X86::VMOVDQA32Zrm, X86::VMOVDQU32Zrm }, + { X86::VMOVDQA32Zrmk, X86::VMOVDQU32Zrmk }, + { X86::VMOVDQA32Zrmkz, X86::VMOVDQU32Zrmkz }, + { X86::VMOVDQA64Z128mr, X86::VMOVDQU64Z128mr }, + { X86::VMOVDQA64Z128mrk, X86::VMOVDQU64Z128mrk }, + { X86::VMOVDQA64Z128rm, X86::VMOVDQU64Z128rm }, + { X86::VMOVDQA64Z128rmk, X86::VMOVDQU64Z128rmk }, + { X86::VMOVDQA64Z128rmkz, X86::VMOVDQU64Z128rmkz }, + { X86::VMOVDQA64Z256mr, X86::VMOVDQU64Z256mr }, + { X86::VMOVDQA64Z256mrk, X86::VMOVDQU64Z256mrk }, + { X86::VMOVDQA64Z256rm, X86::VMOVDQU64Z256rm }, + { X86::VMOVDQA64Z256rmk, X86::VMOVDQU64Z256rmk }, + { X86::VMOVDQA64Z256rmkz, X86::VMOVDQU64Z256rmkz }, + { X86::VMOVDQA64Zmr, X86::VMOVDQU64Zmr }, + { X86::VMOVDQA64Zmrk, X86::VMOVDQU64Zmrk }, + { X86::VMOVDQA64Zrm, X86::VMOVDQU64Zrm }, + { X86::VMOVDQA64Zrmk, X86::VMOVDQU64Zrmk }, + { X86::VMOVDQA64Zrmkz, X86::VMOVDQU64Zrmkz }, + { X86::VMOVDQAYmr, X86::VMOVDQUYmr }, + { X86::VMOVDQAYrm, X86::VMOVDQUYrm }, + { X86::VMOVDQAmr, X86::VMOVDQUmr }, + { X86::VMOVDQArm, X86::VMOVDQUrm }, +}; + namespace { /// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. @@ -2612,6 +2703,18 @@ MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); + // Replace aligned vector move with unaligned vector move when the option is + // enabled. + if (EnableX86UnalignedVecMove) { + for (const auto Pair : AlignedMovToUnalignedMovTable) { + if (Pair[0] == TmpInst.getOpcode()) { + TmpInst.setOpcode(Pair[1]); + OutStreamer->AddComment("AlignMOV convert to UnAlignMOV ", false); + break; + } + } + } + // Stackmap shadows cannot include branch targets, so we can count the bytes // in a call towards the shadow, but must ensure that the no thread returns // in to the stackmap shadow. 
The only way to achieve this is if the call diff --git a/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll b/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll @@ -0,0 +1,441 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-enable-unaligned-vector-move -mattr=sse4.2 | FileCheck %s -check-prefix=CHECK_SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-enable-unaligned-vector-move -mattr=avx | FileCheck %s -check-prefix=CHECK_AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-enable-unaligned-vector-move -mattr=sse4.2 | FileCheck %s -check-prefix=CHECK_SSE32 +; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-enable-unaligned-vector-move -mattr=avx | FileCheck %s -check-prefix=CHECK_AVX32 + +define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind { +; CHECK_SSE-LABEL: test_256_load: +; CHECK_SSE: # %bb.0: # %entry +; CHECK_SSE-NEXT: pushq %r15 +; CHECK_SSE-NEXT: pushq %r14 +; CHECK_SSE-NEXT: pushq %rbx +; CHECK_SSE-NEXT: subq $96, %rsp +; CHECK_SSE-NEXT: movq %rdx, %r14 +; CHECK_SSE-NEXT: movq %rsi, %r15 +; CHECK_SSE-NEXT: movq %rdi, %rbx +; CHECK_SSE-NEXT: movaps (%rdx), %xmm4 +; CHECK_SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rdx), %xmm5 +; CHECK_SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps (%rsi), %xmm2 +; CHECK_SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm3 +; CHECK_SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps (%rdi), %xmm0 +; CHECK_SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rdi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK_SSE-NEXT: callq dummy@PLT +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%rbx) +; CHECK_SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%rbx) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%r15) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%r15) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%r14) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%r14) +; CHECK_SSE-NEXT: addq $96, %rsp +; CHECK_SSE-NEXT: popq %rbx +; CHECK_SSE-NEXT: popq %r14 +; CHECK_SSE-NEXT: popq %r15 +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: test_256_load: +; CHECK_AVX: # %bb.0: # %entry +; CHECK_AVX-NEXT: pushq %r15 +; CHECK_AVX-NEXT: pushq %r14 +; CHECK_AVX-NEXT: pushq %rbx +; CHECK_AVX-NEXT: subq $96, %rsp +; CHECK_AVX-NEXT: movq %rdx, %r14 +; CHECK_AVX-NEXT: movq %rsi, %r15 +; CHECK_AVX-NEXT: movq %rdi, %rbx +; CHECK_AVX-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK_AVX-NEXT: vmovups (%rsi), %ymm1 # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK_AVX-NEXT: vmovups (%rdx), %ymm2 # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; 
CHECK_AVX-NEXT: callq dummy@PLT +; CHECK_AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK_AVX-NEXT: vmovups %ymm0, (%rbx) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK_AVX-NEXT: vmovups %ymm0, (%r15) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK_AVX-NEXT: vmovups %ymm0, (%r14) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: addq $96, %rsp +; CHECK_AVX-NEXT: popq %rbx +; CHECK_AVX-NEXT: popq %r14 +; CHECK_AVX-NEXT: popq %r15 +; CHECK_AVX-NEXT: vzeroupper +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: test_256_load: +; CHECK_SSE32: # %bb.0: # %entry +; CHECK_SSE32-NEXT: pushl %ebp +; CHECK_SSE32-NEXT: movl %esp, %ebp +; CHECK_SSE32-NEXT: pushl %ebx +; CHECK_SSE32-NEXT: pushl %edi +; CHECK_SSE32-NEXT: pushl %esi +; CHECK_SSE32-NEXT: andl $-16, %esp +; CHECK_SSE32-NEXT: subl $160, %esp +; CHECK_SSE32-NEXT: movl 16(%ebp), %esi +; CHECK_SSE32-NEXT: movl 12(%ebp), %edi +; CHECK_SSE32-NEXT: movl 8(%ebp), %ebx +; CHECK_SSE32-NEXT: movaps (%ebx), %xmm0 +; CHECK_SSE32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps 16(%ebx), %xmm1 +; CHECK_SSE32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps (%edi), %xmm2 +; CHECK_SSE32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps 16(%edi), %xmm3 +; CHECK_SSE32-NEXT: movaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps (%esi), %xmm4 +; CHECK_SSE32-NEXT: movaps %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps 16(%esi), %xmm5 +; CHECK_SSE32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK_SSE32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; CHECK_SSE32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; CHECK_SSE32-NEXT: movaps %xmm3, (%esp) +; CHECK_SSE32-NEXT: calll dummy@PLT +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, (%ebx) +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, 16(%ebx) +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, (%edi) +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, 16(%edi) +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, (%esi) +; CHECK_SSE32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE32-NEXT: movaps %xmm0, 16(%esi) +; CHECK_SSE32-NEXT: leal -12(%ebp), %esp +; CHECK_SSE32-NEXT: popl %esi +; CHECK_SSE32-NEXT: popl %edi +; CHECK_SSE32-NEXT: popl %ebx +; CHECK_SSE32-NEXT: popl %ebp +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: test_256_load: +; CHECK_AVX32: # %bb.0: # %entry +; CHECK_AVX32-NEXT: pushl %ebx +; CHECK_AVX32-NEXT: pushl %edi +; CHECK_AVX32-NEXT: pushl %esi +; CHECK_AVX32-NEXT: subl $112, %esp +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK_AVX32-NEXT: vmovups (%ebx), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; CHECK_AVX32-NEXT: vmovups (%edi), %ymm1 # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; 
CHECK_AVX32-NEXT: vmovups (%esi), %ymm2 # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups %ymm2, (%esp) # 32-byte Spill +; CHECK_AVX32-NEXT: calll dummy@PLT +; CHECK_AVX32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK_AVX32-NEXT: vmovups %ymm0, (%ebx) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK_AVX32-NEXT: vmovups %ymm0, (%edi) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups (%esp), %ymm0 # 32-byte Reload +; CHECK_AVX32-NEXT: vmovups %ymm0, (%esi) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: addl $112, %esp +; CHECK_AVX32-NEXT: popl %esi +; CHECK_AVX32-NEXT: popl %edi +; CHECK_AVX32-NEXT: popl %ebx +; CHECK_AVX32-NEXT: vzeroupper +; CHECK_AVX32-NEXT: retl +entry: + %0 = bitcast double* %d to <4 x double>* + %tmp1.i = load <4 x double>, <4 x double>* %0, align 32 + %1 = bitcast float* %f to <8 x float>* + %tmp1.i17 = load <8 x float>, <8 x float>* %1, align 32 + %tmp1.i16 = load <4 x i64>, <4 x i64>* %i, align 32 + tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind + store <4 x double> %tmp1.i, <4 x double>* %0, align 32 + store <8 x float> %tmp1.i17, <8 x float>* %1, align 32 + store <4 x i64> %tmp1.i16, <4 x i64>* %i, align 32 + ret void +} + +declare void @dummy(<4 x double>, <8 x float>, <4 x i64>) + +define void @storev16i16(<16 x i16> %a) nounwind { +; CHECK_SSE-LABEL: storev16i16: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps %xmm1, (%rax) +; CHECK_SSE-NEXT: movaps %xmm0, (%rax) +; +; CHECK_AVX-LABEL: storev16i16: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups %ymm0, (%rax) # AlignMOV convert to UnAlignMOV +; +; CHECK_SSE32-LABEL: storev16i16: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movaps %xmm1, (%eax) +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; +; CHECK_AVX32-LABEL: storev16i16: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV + store <16 x i16> %a, <16 x i16>* undef, align 32 + unreachable +} + +define void @storev16i16_01(<16 x i16> %a) nounwind { +; CHECK_SSE-LABEL: storev16i16_01: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movups %xmm1, (%rax) +; CHECK_SSE-NEXT: movups %xmm0, (%rax) +; +; CHECK_AVX-LABEL: storev16i16_01: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups %ymm0, (%rax) +; +; CHECK_SSE32-LABEL: storev16i16_01: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movups %xmm1, (%eax) +; CHECK_SSE32-NEXT: movups %xmm0, (%eax) +; +; CHECK_AVX32-LABEL: storev16i16_01: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) + store <16 x i16> %a, <16 x i16>* undef, align 4 + unreachable +} + +define void @storev32i8(<32 x i8> %a) nounwind { +; CHECK_SSE-LABEL: storev32i8: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps %xmm1, (%rax) +; CHECK_SSE-NEXT: movaps %xmm0, (%rax) +; +; CHECK_AVX-LABEL: storev32i8: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups %ymm0, (%rax) # AlignMOV convert to UnAlignMOV +; +; CHECK_SSE32-LABEL: storev32i8: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movaps %xmm1, (%eax) +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; +; CHECK_AVX32-LABEL: storev32i8: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV + store <32 x i8> %a, <32 x i8>* undef, align 32 + unreachable +} + +define void @storev32i8_01(<32 x i8> %a) nounwind { +; CHECK_SSE-LABEL: storev32i8_01: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movups %xmm1, (%rax) +; CHECK_SSE-NEXT: movups 
%xmm0, (%rax) +; +; CHECK_AVX-LABEL: storev32i8_01: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups %ymm0, (%rax) +; +; CHECK_SSE32-LABEL: storev32i8_01: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movups %xmm1, (%eax) +; CHECK_SSE32-NEXT: movups %xmm0, (%eax) +; +; CHECK_AVX32-LABEL: storev32i8_01: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) + store <32 x i8> %a, <32 x i8>* undef, align 4 + unreachable +} + +; It is faster to make two saves, if the data is already in xmm registers. For +; example, after making an integer operation. +define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp { +; CHECK_SSE-LABEL: double_save: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: double_save: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups %xmm1, 16(%rdi) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups %xmm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: double_save: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_SSE32-NEXT: movaps %xmm1, 16(%eax) +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: double_save: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_AVX32-NEXT: vmovups %xmm1, 16(%eax) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups %xmm0, (%eax) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: retl + %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i32> %Z, <8 x i32>* %P, align 16 + ret void +} + +define void @double_save_volatile(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind { +; CHECK_SSE-LABEL: double_save_volatile: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: double_save_volatile: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK_AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK_AVX-NEXT: vmovups %ymm0, (%rdi) +; CHECK_AVX-NEXT: vzeroupper +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: double_save_volatile: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_SSE32-NEXT: movaps %xmm1, 16(%eax) +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: double_save_volatile: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_AVX32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) +; CHECK_AVX32-NEXT: vzeroupper +; CHECK_AVX32-NEXT: retl + %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store volatile <8 x i32> %Z, <8 x i32>* %P, align 16 + ret void +} + +define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { +; CHECK_SSE-LABEL: add8i32: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movups (%rsi), %xmm0 +; CHECK_SSE-NEXT: movups 16(%rsi), %xmm1 +; CHECK_SSE-NEXT: movups %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: movups %xmm0, (%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: add8i32: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups (%rsi), %ymm0 +; CHECK_AVX-NEXT: vmovups %ymm0, (%rdi) +; CHECK_AVX-NEXT: vzeroupper +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: add8i32: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_SSE32-NEXT: movl
{{[0-9]+}}(%esp), %ecx +; CHECK_SSE32-NEXT: movups (%ecx), %xmm0 +; CHECK_SSE32-NEXT: movups 16(%ecx), %xmm1 +; CHECK_SSE32-NEXT: movups %xmm1, 16(%eax) +; CHECK_SSE32-NEXT: movups %xmm0, (%eax) +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: add8i32: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK_AVX32-NEXT: vmovups (%ecx), %ymm0 +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) +; CHECK_AVX32-NEXT: vzeroupper +; CHECK_AVX32-NEXT: retl + %b = load <8 x i32>, <8 x i32>* %bp, align 1 + %x = add <8 x i32> zeroinitializer, %b + store <8 x i32> %x, <8 x i32>* %ret, align 1 + ret void +} + +define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { +; CHECK_SSE-LABEL: add4i64a64: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps (%rsi), %xmm0 +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: add4i64a64: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups (%rsi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK_AVX-NEXT: vzeroupper +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: add4i64a64: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK_SSE32-NEXT: movaps (%ecx), %xmm0 +; CHECK_SSE32-NEXT: movaps 16(%ecx), %xmm1 +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; CHECK_SSE32-NEXT: movaps %xmm1, 16(%eax) +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: add4i64a64: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK_AVX32-NEXT: vmovups (%ecx), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV +; CHECK_AVX32-NEXT: vzeroupper +; CHECK_AVX32-NEXT: retl + %b = load <4 x i64>, <4 x i64>* %bp, align 64 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 64 + ret void +} + +define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { +; CHECK_SSE-LABEL: add4i64a16: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps (%rsi), %xmm0 +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK_AVX-LABEL: add4i64a16: +; CHECK_AVX: # %bb.0: +; CHECK_AVX-NEXT: vmovups (%rsi), %ymm0 +; CHECK_AVX-NEXT: vmovups %ymm0, (%rdi) +; CHECK_AVX-NEXT: vzeroupper +; CHECK_AVX-NEXT: retq +; +; CHECK_SSE32-LABEL: add4i64a16: +; CHECK_SSE32: # %bb.0: +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_SSE32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK_SSE32-NEXT: movaps (%ecx), %xmm0 +; CHECK_SSE32-NEXT: movaps 16(%ecx), %xmm1 +; CHECK_SSE32-NEXT: movaps %xmm1, 16(%eax) +; CHECK_SSE32-NEXT: movaps %xmm0, (%eax) +; CHECK_SSE32-NEXT: retl +; +; CHECK_AVX32-LABEL: add4i64a16: +; CHECK_AVX32: # %bb.0: +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK_AVX32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK_AVX32-NEXT: vmovups (%ecx), %ymm0 +; CHECK_AVX32-NEXT: vmovups %ymm0, (%eax) +; CHECK_AVX32-NEXT: vzeroupper +; CHECK_AVX32-NEXT: retl + %b = load <4 x i64>, <4 x i64>* %bp, align 16 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 16 + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll 
b/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll @@ -0,0 +1,595 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-enable-unaligned-vector-move -mattr=avx512f | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-enable-unaligned-vector-move -mattr=avx512f | FileCheck %s -check-prefix=X86 + +define <16 x i32> @test17(i8 * %addr) { +; X64-LABEL: test17: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test17: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x i32>* + %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 + ret <16 x i32>%res +} + +define void @test18(i8 * %addr, <8 x i64> %data) { +; X64-LABEL: test18: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test18: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i64>* + store <8 x i64>%data, <8 x i64>* %vaddr, align 64 + ret void +} + +define void @test19(i8 * %addr, <16 x i32> %data) { +; X64-LABEL: test19: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test19: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 1 + ret void +} + +define void @test20(i8 * %addr, <16 x i32> %data) { +; X64-LABEL: test20: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test20: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 64 + ret void +} + +define <8 x i64> @test21(i8 * %addr) { +; X64-LABEL: test21: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test21: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 + ret <8 x i64>%res +} + +define void @test22(i8 * %addr, <8 x i64> %data) { +; X64-LABEL: test22: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test22: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i64>* + store <8 x i64>%data, <8 x i64>* %vaddr, align 1 + ret void +} + +define <8 x i64> @test23(i8 * %addr) { +; X64-LABEL: test23: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: test23: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups 
(%eax), %zmm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 + ret <8 x i64>%res +} + +define void @test24(i8 * %addr, <8 x double> %data) { +; X64-LABEL: test24: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 64 + ret void +} + +define <8 x double> @test25(i8 * %addr) { +; X64-LABEL: test25: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test25: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>, <8 x double>* %vaddr, align 64 + ret <8 x double>%res +} + +define void @test26(i8 * %addr, <16 x float> %data) { +; X64-LABEL: test26: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test26: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 64 + ret void +} + +define <16 x float> @test27(i8 * %addr) { +; X64-LABEL: test27: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test27: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>, <16 x float>* %vaddr, align 64 + ret <16 x float>%res +} + +define void @test28(i8 * %addr, <8 x double> %data) { +; X64-LABEL: test28: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test28: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 1 + ret void +} + +define <8 x double> @test29(i8 * %addr) { +; X64-LABEL: test29: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: test29: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>, <8 x double>* %vaddr, align 1 + ret <8 x double>%res +} + +define void @test30(i8 * %addr, <16 x float> %data) { +; X64-LABEL: test30: +; X64: # %bb.0: +; X64-NEXT: vmovups %zmm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test30: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %zmm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 1 + ret void +} + +define <16 x float> @test31(i8 * %addr) { +; X64-LABEL: test31: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 +; 
X64-NEXT: retq +; +; X86-LABEL: test31: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>, <16 x float>* %vaddr, align 1 + ret <16 x float>%res +} + +define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { +; X64-LABEL: test32: +; X64: # %bb.0: +; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 +; X64-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 +; X86-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { +; X64-LABEL: test33: +; X64: # %bb.0: +; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 +; X64-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: test33: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 +; X86-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { +; X64-LABEL: test34: +; X64: # %bb.0: +; X64-NEXT: vptestmd %zmm0, %zmm0, %k1 +; X64-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test34: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmd %zmm0, %zmm0, %k1 +; X86-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { +; X64-LABEL: test35: +; X64: # %bb.0: +; X64-NEXT: vptestmd %zmm0, %zmm0, %k1 +; X64-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test35: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmd %zmm0, %zmm0, %k1 +; X86-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { +; X64-LABEL: test36: +; X64: # %bb.0: +; X64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; X64-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test36: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmq %zmm1, %zmm1, %k1 +; X86-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 
x i64>, <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { +; X64-LABEL: test37: +; X64: # %bb.0: +; X64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; X64-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: test37: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmq %zmm1, %zmm1, %k1 +; X86-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { +; X64-LABEL: test38: +; X64: # %bb.0: +; X64-NEXT: vptestmq %zmm0, %zmm0, %k1 +; X64-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test38: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmq %zmm0, %zmm0, %k1 +; X86-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) { +; X64-LABEL: test39: +; X64: # %bb.0: +; X64-NEXT: vptestmq %zmm0, %zmm0, %k1 +; X64-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test39: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vptestmq %zmm0, %zmm0, %k1 +; X86-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { +; X64-LABEL: test40: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test40: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old + ret <16 x float>%res +} + +define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { +; X64-LABEL: test41: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: test41: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 1 + %res = 
select <16 x i1> %mask, <16 x float> %r, <16 x float> %old + ret <16 x float>%res +} + +define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) { +; X64-LABEL: test42: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test42: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) { +; X64-LABEL: test43: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test43: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { +; X64-LABEL: test44: +; X64: # %bb.0: +; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; X64-NEXT: vmovupd (%rdi), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test44: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; X86-NEXT: vmovupd (%eax), %zmm0 {%k1} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { +; X64-LABEL: test45: +; X64: # %bb.0: +; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; X64-NEXT: vmovupd (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: test45: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; X86-NEXT: vmovupd (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) { +; X64-LABEL: test46: +; X64: # %bb.0: +; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X64-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; X64-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test46: +; X86: # %bb.0: +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; X86-NEXT: vmovupd (%eax), %zmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer + ret <8 x double>%res +} + +define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) { +; X64-LABEL: test47: +; X64: # %bb.0: +; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X64-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; X64-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test47: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; X86-NEXT: vmovupd (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer + ret <8 x double>%res +} diff --git a/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll b/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll @@ -0,0 +1,747 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f -mattr=avx512vl -x86-enable-unaligned-vector-move | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f -mattr=avx512vl -x86-enable-unaligned-vector-move | FileCheck %s -check-prefix=X86 + +define <8 x i32> @test_256_1(i8 * %addr) { +; CHECK-LABEL: test_256_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq +; X64-LABEL: test_256_1: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: test_256_1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>, <8 x i32>* %vaddr, align 1 + ret <8 x i32>%res +} + +define <8 x i32> @test_256_2(i8 * %addr) { +; CHECK-LABEL: test_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: retq +; X64-LABEL: test_256_2: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test_256_2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>, <8 x i32>* %vaddr, align 32 + ret <8 x i32>%res +} + +define void @test_256_3(i8 * %addr, <4 x i64> %data) { +; CHECK-LABEL: test_256_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_3: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x 
i64>%data, <4 x i64>* %vaddr, align 32 + ret void +} + +define void @test_256_4(i8 * %addr, <8 x i32> %data) { +; CHECK-LABEL: test_256_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_4: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 1 + ret void +} + +define void @test_256_5(i8 * %addr, <8 x i32> %data) { +; CHECK-LABEL: test_256_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_5: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_5: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 32 + ret void +} + +define <4 x i64> @test_256_6(i8 * %addr) { +; CHECK-LABEL: test_256_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: retq +; X64-LABEL: test_256_6: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test_256_6: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>, <4 x i64>* %vaddr, align 32 + ret <4 x i64>%res +} + +define void @test_256_7(i8 * %addr, <4 x i64> %data) { +; CHECK-LABEL: test_256_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_7: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x i64>%data, <4 x i64>* %vaddr, align 1 + ret void +} + +define <4 x i64> @test_256_8(i8 * %addr) { +; CHECK-LABEL: test_256_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq +; X64-LABEL: test_256_8: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: test_256_8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>, <4 x i64>* %vaddr, align 1 + ret <4 x i64>%res +} + +define void @test_256_9(i8 * %addr, <4 x double> %data) { +; CHECK-LABEL: test_256_9: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_9: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_9: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: 
vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 32 + ret void +} + +define <4 x double> @test_256_10(i8 * %addr) { +; CHECK-LABEL: test_256_10: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: retq +; X64-LABEL: test_256_10: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test_256_10: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>, <4 x double>* %vaddr, align 32 + ret <4 x double>%res +} + +define void @test_256_11(i8 * %addr, <8 x float> %data) { +; CHECK-LABEL: test_256_11: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_11: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) # AlignMOV convert to UnAlignMOV +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_11: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) # AlignMOV convert to UnAlignMOV +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 32 + ret void +} + +define <8 x float> @test_256_12(i8 * %addr) { +; CHECK-LABEL: test_256_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; CHECK-NEXT: retq +; X64-LABEL: test_256_12: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 # AlignMOV convert to UnAlignMOV +; X64-NEXT: retq +; +; X86-LABEL: test_256_12: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 # AlignMOV convert to UnAlignMOV +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>, <8 x float>* %vaddr, align 32 + ret <8 x float>%res +} + +define void @test_256_13(i8 * %addr, <4 x double> %data) { +; CHECK-LABEL: test_256_13: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_13: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_13: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups %ymm0, (%eax) +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 1 + ret void +} + +define <4 x double> @test_256_14(i8 * %addr) { +; CHECK-LABEL: test_256_14: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq +; X64-LABEL: test_256_14: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: test_256_14: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 +; X86-NEXT: retl + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>, <4 x double>* %vaddr, align 1 + ret <4 x double>%res +} + +define void @test_256_15(i8 * %addr, <8 x float> %data) { +; CHECK-LABEL: test_256_15: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; X64-LABEL: test_256_15: +; X64: # %bb.0: +; X64-NEXT: vmovups %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: test_256_15: +; X86: # 
%bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ store <8 x float>%data, <8 x float>* %vaddr, align 1
+ ret void
+}
+
+define <8 x float> @test_256_16(i8 * %addr) {
+; CHECK-LABEL: test_256_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_16:
+; X64: # %bb.0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups (%eax), %ymm0
+; X86-NEXT: retl
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %res = load <8 x float>, <8 x float>* %vaddr, align 1
+ ret <8 x float>%res
+}
+
+define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_17:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_17:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %ymm1, %ymm1, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_17:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %ymm1, %ymm1, %k1
+; X86-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>, <8 x i32>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+ ret <8 x i32>%res
+}
+
+define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_18:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_18:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %ymm1, %ymm1, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_18:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %ymm1, %ymm1, %k1
+; X86-NEXT: vmovdqu32 (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>, <8 x i32>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+ ret <8 x i32>%res
+}
+
+define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_19:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %ymm0, %ymm0, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_19:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %ymm0, %ymm0, %k1
+; X86-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>, <8 x i32>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+ ret <8 x i32>%res
+}
+
+define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_20:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_20:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %ymm0, %ymm0, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_20:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %ymm0, %ymm0, %k1
+; X86-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>, <8 x i32>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+ ret <8 x i32>%res
+}
+
+define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_21:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_21:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %ymm1, %ymm1, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_21:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %ymm1, %ymm1, %k1
+; X86-NEXT: vmovdqu64 (%eax), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>, <4 x i64>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+ ret <4 x i64>%res
+}
+
+define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_22:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_22:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %ymm1, %ymm1, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_22:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %ymm1, %ymm1, %k1
+; X86-NEXT: vmovdqu64 (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>, <4 x i64>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+ ret <4 x i64>%res
+}
+
+define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_23:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_23:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %ymm0, %ymm0, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_23:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %ymm0, %ymm0, %k1
+; X86-NEXT: vmovdqu64 (%eax), %ymm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>, <4 x i64>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+ ret <4 x i64>%res
+}
+
+define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_24:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+; X64-LABEL: test_256_24:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %ymm0, %ymm0, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_256_24:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %ymm0, %ymm0, %k1
+; X86-NEXT: vmovdqu64 (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>, <4 x i64>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+ ret <4 x i64>%res
+}
+
+define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_17:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_17:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %xmm1, %xmm1, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_17:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %xmm1, %xmm1, %k1
+; X86-NEXT: vmovdqu32 (%eax), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>, <4 x i32>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+ ret <4 x i32>%res
+}
+
+define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_18:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_18:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %xmm1, %xmm1, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_18:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %xmm1, %xmm1, %k1
+; X86-NEXT: vmovdqu32 (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>, <4 x i32>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+ ret <4 x i32>%res
+}
+
+define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_19:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %xmm0, %xmm0, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_19:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %xmm0, %xmm0, %k1
+; X86-NEXT: vmovdqu32 (%eax), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>, <4 x i32>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+ ret <4 x i32>%res
+}
+
+define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_20:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_20:
+; X64: # %bb.0:
+; X64-NEXT: vptestmd %xmm0, %xmm0, %k1
+; X64-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_20:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmd %xmm0, %xmm0, %k1
+; X86-NEXT: vmovdqu32 (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>, <4 x i32>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+ ret <4 x i32>%res
+}
+
+define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_21:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_21:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %xmm1, %xmm1, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_21:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %xmm1, %xmm1, %k1
+; X86-NEXT: vmovdqu64 (%eax), %xmm0 {%k1} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>, <2 x i64>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+ ret <2 x i64>%res
+}
+
+define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_22:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_22:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %xmm1, %xmm1, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_22:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %xmm1, %xmm1, %k1
+; X86-NEXT: vmovdqu64 (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>, <2 x i64>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+ ret <2 x i64>%res
+}
+
+define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_23:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_23:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %xmm0, %xmm0, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_23:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %xmm0, %xmm0, %k1
+; X86-NEXT: vmovdqu64 (%eax), %xmm0 {%k1} {z} # AlignMOV convert to UnAlignMOV
+; X86-NEXT: retl
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>, <2 x i64>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+ ret <2 x i64>%res
+}
+
+define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_24:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+; X64-LABEL: test_128_24:
+; X64: # %bb.0:
+; X64-NEXT: vptestmq %xmm0, %xmm0, %k1
+; X64-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+;
+; X86-LABEL: test_128_24:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vptestmq %xmm0, %xmm0, %k1
+; X86-NEXT: vmovdqu64 (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>, <2 x i64>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+ ret <2 x i64>%res
+}
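
Note on the CHECK lines above: the "# AlignMOV convert to UnAlignMOV" asm comment is emitted only for accesses whose IR alignment meets the natural vector alignment (align 16/32 here); accesses with align 1 already select the unaligned encoding and are left untouched. As a rough way to reproduce one of these cases outside the lit test, the sketch below runs the backend flag added by this patch on IR adapted from @test_256_17; the file name, function name, -mtriple, and -mattr string are assumptions, since this test's own RUN lines are not shown in this excerpt.

  $ cat > repro.ll <<'EOF'
  ; Hypothetical standalone reproduction, adapted from @test_256_17 above.
  define <8 x i32> @repro(i8* %addr, <8 x i32> %old, <8 x i32> %mask1) {
    %mask = icmp ne <8 x i32> %mask1, zeroinitializer
    %vaddr = bitcast i8* %addr to <8 x i32>*
    %r = load <8 x i32>, <8 x i32>* %vaddr, align 32
    %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
    ret <8 x i32> %res
  }
  EOF
  $ llc -mtriple=x86_64-unknown-linux -mattr=+avx512vl \
        -x86-enable-unaligned-vector-move=true repro.ll -o -
  # Expected with this patch applied: the masked aligned load is printed as
  #   vmovdqu32 (%rdi), %ymm0 {%k1} # AlignMOV convert to UnAlignMOV
  # instead of vmovdqa32.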