diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -70,6 +70,7 @@
X86TargetMachine.cpp
X86TargetObjectFile.cpp
X86TargetTransformInfo.cpp
+ X86UnalignedVectorMoves.cpp
X86VZeroUpper.cpp
X86WinAllocaExpander.cpp
X86WinEHState.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -141,6 +141,7 @@
FunctionPass *createX86LoadValueInjectionRetHardeningPass();
FunctionPass *createX86SpeculativeLoadHardeningPass();
FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
+FunctionPass *createX86UnalignedVectorMoves();

void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
@@ -162,6 +163,7 @@
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
+void initializeX86UnalignedVectorMovePassPass(PassRegistry &);

namespace X86AS {
enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -83,6 +83,7 @@
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
initializeX86PartialReductionPass(PR);
+ initializeX86UnalignedVectorMovePassPass(PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -524,6 +525,7 @@
    addPass(createX86PadShortFunctions());
    addPass(createX86FixupLEAs());
  }
+  addPass(createX86UnalignedVectorMoves());
  addPass(createX86EvexToVexInsts());
  addPass(createX86DiscriminateMemOpsPass());
  addPass(createX86InsertPrefetchPass());
diff --git a/llvm/lib/Target/X86/X86UnalignedVectorMoves.cpp b/llvm/lib/Target/X86/X86UnalignedVectorMoves.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/X86/X86UnalignedVectorMoves.cpp
@@ -0,0 +1,198 @@
+//===- X86UnalignedVectorMoves.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines the pass that replaces aligned vector moves with
+/// unaligned vector moves. An unaligned vector move achieves the same
+/// performance as an aligned vector move when the address is aligned.
+/// When the address is not aligned, an unaligned vector move executes
+/// without raising an exception, whereas an aligned vector move raises
+/// one. Sometimes the user wants to suppress that exception, so an option
+/// is provided for this purpose.
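+///
+/// For example, with the option enabled the 256-bit aligned load
+///
+///   vmovaps (%rdi), %ymm0  # raises #GP if %rdi is not 32-byte aligned
+///
+/// is rewritten to
+///
+///   vmovups (%rdi), %ymm0  # tolerates any alignment, same speed when
+///                          # %rdi is 32-byte aligned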
+//
+//===----------------------------------------------------------------------===//

+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define UNALIGNED_VEC_MOV_DESC "X86 unaligned vector move"
+#define DEBUG_TYPE "x86-unaligned-vector-move"
+
+static cl::opt<bool> EnableX86UnalignedVecMov(
+    "x86-enable-unaligned-vector-move", cl::Hidden,
+    cl::desc("X86: Enable transforming aligned vector move instructions to "
+             "unaligned vector moves."),
+    cl::init(false));
+
+namespace {
+
+class X86UnalignedVectorMovePass : public MachineFunctionPass {
+
+  bool alignedMovToUnalignedMov(MachineInstr &MI) const;
+
+public:
+  static char ID;
+
+  X86UnalignedVectorMovePass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return UNALIGNED_VEC_MOV_DESC; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // This pass runs after regalloc and doesn't support VReg operands.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  /// Machine instruction info used throughout the class.
+  const X86InstrInfo *TII = nullptr;
+};
+
+} // end anonymous namespace
+
+char X86UnalignedVectorMovePass::ID = 0;
+
+bool X86UnalignedVectorMovePass::runOnMachineFunction(MachineFunction &MF) {
+  if (!EnableX86UnalignedVecMov)
+    return false;
+
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  TII = ST.getInstrInfo();
+  if (!ST.hasAVX())
+    return false;
+
+  bool Changed = false;
+
+  // Go over all basic blocks in the function and replace aligned vector
+  // moves with their unaligned equivalents where possible.
+  for (MachineBasicBlock &MBB : MF) {
+
+    // Traverse the basic block.
+    for (MachineInstr &MI : MBB)
+      Changed |= alignedMovToUnalignedMov(MI);
+  }
+
+  return Changed;
+}
+
+bool X86UnalignedVectorMovePass::alignedMovToUnalignedMov(
+    MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+  unsigned NewOpc;
+
+  switch (Opc) {
+  default:
+    return false;
+  // Replace vmovaps with vmovups.
+  // MOVAPSmr and MOVAPSrm aren't used when AVX is enabled.
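+  // Opcode suffix key: 'rm' = load (register <- memory), 'mr' = store
+  // (memory <- register); trailing 'k'/'kz' = merge-/zero-masked EVEX form;
+  // 'Y' = 256-bit VEX; 'Z128'/'Z256'/'Z' = 128/256/512-bit EVEX; '_NOVLX' =
+  // pseudo used when AVX512VL is unavailable.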
+ case X86::VMOVAPSYmr: NewOpc = X86::VMOVUPSYmr; break; + case X86::VMOVAPSYrm: NewOpc = X86::VMOVUPSYrm; break; + case X86::VMOVAPSZ128mr: NewOpc = X86::VMOVUPSZ128mr; break; + case X86::VMOVAPSZ128mr_NOVLX: NewOpc = X86::VMOVUPSZ128mr_NOVLX; break; + case X86::VMOVAPSZ128mrk: NewOpc = X86::VMOVUPSZ128mrk; break; + case X86::VMOVAPSZ128rm: NewOpc = X86::VMOVUPSZ128rm; break; + case X86::VMOVAPSZ128rm_NOVLX: NewOpc = X86::VMOVUPSZ128rm_NOVLX; break; + case X86::VMOVAPSZ128rmk: NewOpc = X86::VMOVUPSZ128rmk; break; + case X86::VMOVAPSZ128rmkz: NewOpc = X86::VMOVUPSZ128rmkz; break; + case X86::VMOVAPSZ256mr: NewOpc = X86::VMOVUPSZ256mr; break; + case X86::VMOVAPSZ256mr_NOVLX: NewOpc = X86::VMOVUPSZ256mr_NOVLX; break; + case X86::VMOVAPSZ256mrk: NewOpc = X86::VMOVUPSZ256mrk; break; + case X86::VMOVAPSZ256rm: NewOpc = X86::VMOVUPSZ256rm; break; + case X86::VMOVAPSZ256rm_NOVLX: NewOpc = X86::VMOVUPSZ256rm_NOVLX; break; + case X86::VMOVAPSZ256rmk: NewOpc = X86::VMOVUPSZ256rmk; break; + case X86::VMOVAPSZ256rmkz: NewOpc = X86::VMOVUPSZ256rmkz; break; + case X86::VMOVAPSZmr: NewOpc = X86::VMOVUPSZmr; break; + case X86::VMOVAPSZmrk: NewOpc = X86::VMOVUPSZmrk; break; + case X86::VMOVAPSZrm: NewOpc = X86::VMOVUPSZrm; break; + case X86::VMOVAPSZrmk: NewOpc = X86::VMOVUPSZrmk; break; + case X86::VMOVAPSZrmkz: NewOpc = X86::VMOVUPSZrmkz; break; + case X86::VMOVAPSmr: NewOpc = X86::VMOVUPSmr; break; + case X86::VMOVAPSrm: NewOpc = X86::VMOVUPSrm; break; + // Replace vmovapd with vmovupd. + // MOVAPDmr and MOVAPDrm aren't used when AVX is enabled. + case X86::VMOVAPDYmr: NewOpc = X86::VMOVUPDYmr; break; + case X86::VMOVAPDYrm: NewOpc = X86::VMOVUPDYrm; break; + case X86::VMOVAPDZ128mr: NewOpc = X86::VMOVUPDZ128mr; break; + case X86::VMOVAPDZ128mrk: NewOpc = X86::VMOVUPDZ128mrk; break; + case X86::VMOVAPDZ128rm: NewOpc = X86::VMOVUPDZ128rm; break; + case X86::VMOVAPDZ128rmk: NewOpc = X86::VMOVUPDZ128rmk; break; + case X86::VMOVAPDZ128rmkz: NewOpc = X86::VMOVUPDZ128rmkz; break; + case X86::VMOVAPDZ256mr: NewOpc = X86::VMOVUPDZ256mr; break; + case X86::VMOVAPDZ256mrk: NewOpc = X86::VMOVUPDZ256mrk; break; + case X86::VMOVAPDZ256rm: NewOpc = X86::VMOVUPDZ256rm; break; + case X86::VMOVAPDZ256rmk: NewOpc = X86::VMOVUPDZ256rmk; break; + case X86::VMOVAPDZ256rmkz: NewOpc = X86::VMOVUPDZ256rmkz; break; + case X86::VMOVAPDZmr: NewOpc = X86::VMOVUPDZmr; break; + case X86::VMOVAPDZmrk: NewOpc = X86::VMOVUPDZmrk; break; + case X86::VMOVAPDZrm: NewOpc = X86::VMOVUPDZrm; break; + case X86::VMOVAPDZrmk: NewOpc = X86::VMOVUPDZrmk; break; + case X86::VMOVAPDZrmkz: NewOpc = X86::VMOVUPDZrmkz; break; + case X86::VMOVAPDmr: NewOpc = X86::VMOVUPDmr; break; + case X86::VMOVAPDrm: NewOpc = X86::VMOVUPDrm; break; + // Replace vmovdqa with vmovdqu. + // MOVDQAmr and MOVDQArm aren't used when AVX is enabled. 
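+  // AVX512 splits the integer moves by element size (VMOVDQA32/VMOVDQA64)
+  // because the element size determines the masking granularity; the
+  // VEX-encoded VMOVDQA/VMOVDQAY forms carry no element size.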
+  case X86::VMOVDQA32Z128mr: NewOpc = X86::VMOVDQU32Z128mr; break;
+  case X86::VMOVDQA32Z128mrk: NewOpc = X86::VMOVDQU32Z128mrk; break;
+  case X86::VMOVDQA32Z128rm: NewOpc = X86::VMOVDQU32Z128rm; break;
+  case X86::VMOVDQA32Z128rmk: NewOpc = X86::VMOVDQU32Z128rmk; break;
+  case X86::VMOVDQA32Z128rmkz: NewOpc = X86::VMOVDQU32Z128rmkz; break;
+  case X86::VMOVDQA32Z256mr: NewOpc = X86::VMOVDQU32Z256mr; break;
+  case X86::VMOVDQA32Z256mrk: NewOpc = X86::VMOVDQU32Z256mrk; break;
+  case X86::VMOVDQA32Z256rm: NewOpc = X86::VMOVDQU32Z256rm; break;
+  case X86::VMOVDQA32Z256rmk: NewOpc = X86::VMOVDQU32Z256rmk; break;
+  case X86::VMOVDQA32Z256rmkz: NewOpc = X86::VMOVDQU32Z256rmkz; break;
+  case X86::VMOVDQA32Zmr: NewOpc = X86::VMOVDQU32Zmr; break;
+  case X86::VMOVDQA32Zmrk: NewOpc = X86::VMOVDQU32Zmrk; break;
+  case X86::VMOVDQA32Zrm: NewOpc = X86::VMOVDQU32Zrm; break;
+  case X86::VMOVDQA32Zrmk: NewOpc = X86::VMOVDQU32Zrmk; break;
+  case X86::VMOVDQA32Zrmkz: NewOpc = X86::VMOVDQU32Zrmkz; break;
+  case X86::VMOVDQA64Z128mr: NewOpc = X86::VMOVDQU64Z128mr; break;
+  case X86::VMOVDQA64Z128mrk: NewOpc = X86::VMOVDQU64Z128mrk; break;
+  case X86::VMOVDQA64Z128rm: NewOpc = X86::VMOVDQU64Z128rm; break;
+  case X86::VMOVDQA64Z128rmk: NewOpc = X86::VMOVDQU64Z128rmk; break;
+  case X86::VMOVDQA64Z128rmkz: NewOpc = X86::VMOVDQU64Z128rmkz; break;
+  case X86::VMOVDQA64Z256mr: NewOpc = X86::VMOVDQU64Z256mr; break;
+  case X86::VMOVDQA64Z256mrk: NewOpc = X86::VMOVDQU64Z256mrk; break;
+  case X86::VMOVDQA64Z256rm: NewOpc = X86::VMOVDQU64Z256rm; break;
+  case X86::VMOVDQA64Z256rmk: NewOpc = X86::VMOVDQU64Z256rmk; break;
+  case X86::VMOVDQA64Z256rmkz: NewOpc = X86::VMOVDQU64Z256rmkz; break;
+  case X86::VMOVDQA64Zmr: NewOpc = X86::VMOVDQU64Zmr; break;
+  case X86::VMOVDQA64Zmrk: NewOpc = X86::VMOVDQU64Zmrk; break;
+  case X86::VMOVDQA64Zrm: NewOpc = X86::VMOVDQU64Zrm; break;
+  case X86::VMOVDQA64Zrmk: NewOpc = X86::VMOVDQU64Zrmk; break;
+  case X86::VMOVDQA64Zrmkz: NewOpc = X86::VMOVDQU64Zrmkz; break;
+  case X86::VMOVDQAYmr: NewOpc = X86::VMOVDQUYmr; break;
+  case X86::VMOVDQAYrm: NewOpc = X86::VMOVDQUYrm; break;
+  case X86::VMOVDQAmr: NewOpc = X86::VMOVDQUmr; break;
+  case X86::VMOVDQArm: NewOpc = X86::VMOVDQUrm; break;
+  }
+
+  MI.setDesc(TII->get(NewOpc));
+  return true;
+}
+
+INITIALIZE_PASS(X86UnalignedVectorMovePass, DEBUG_TYPE, UNALIGNED_VEC_MOV_DESC,
+                false, false)
+
+FunctionPass *llvm::createX86UnalignedVectorMoves() {
+  return new X86UnalignedVectorMovePass();
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -58,6 +58,7 @@
; CHECK-NEXT: Implement the 'patchable-function' attribute
; CHECK-NEXT: X86 Indirect Branch Tracking
; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: X86 unaligned vector move
; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
; CHECK-NEXT: X86 Discriminate Memory Operands
; CHECK-NEXT: X86 Insert Cache Prefetches
diff --git a/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll b/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx-unaligned-load-store.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2,slow-unaligned-mem-16 -x86-enable-unaligned-vector-move | FileCheck %s -check-prefix=CHECK_SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32
-x86-enable-unaligned-vector-move | FileCheck %s + +define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind { +; CHECK_SSE-LABEL: test_256_load: +; CHECK_SSE: # %bb.0: # %entry +; CHECK_SSE-NEXT: pushq %r15 +; CHECK_SSE-NEXT: pushq %r14 +; CHECK_SSE-NEXT: pushq %rbx +; CHECK_SSE-NEXT: subq $96, %rsp +; CHECK_SSE-NEXT: movq %rdx, %r14 +; CHECK_SSE-NEXT: movq %rsi, %r15 +; CHECK_SSE-NEXT: movq %rdi, %rbx +; CHECK_SSE-NEXT: movaps (%rdx), %xmm4 +; CHECK_SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rdx), %xmm5 +; CHECK_SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps (%rsi), %xmm2 +; CHECK_SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm3 +; CHECK_SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps (%rdi), %xmm0 +; CHECK_SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK_SSE-NEXT: movaps 16(%rdi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK_SSE-NEXT: callq dummy +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%rbx) +; CHECK_SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%rbx) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%r15) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%r15) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, (%r14) +; CHECK_SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK_SSE-NEXT: movaps %xmm0, 16(%r14) +; CHECK_SSE-NEXT: addq $96, %rsp +; CHECK_SSE-NEXT: popq %rbx +; CHECK_SSE-NEXT: popq %r14 +; CHECK_SSE-NEXT: popq %r15 +; CHECK_SSE-NEXT: retq +; +; CHECK-LABEL: test_256_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $96, %rsp +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rsi, %r15 +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups (%rsi), %ymm1 +; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups (%rdx), %ymm2 +; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; CHECK-NEXT: callq dummy +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovups %ymm0, (%rbx) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovups %ymm0, (%r15) +; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovups %ymm0, (%r14) +; CHECK-NEXT: addq $96, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast double* %d to <4 x double>* + %tmp1.i = load <4 x double>, <4 x double>* %0, align 32 + %1 = bitcast float* %f to <8 x float>* + %tmp1.i17 = load <8 x float>, <8 x float>* %1, align 32 + %tmp1.i16 = load <4 x i64>, <4 x i64>* %i, align 32 + tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind + store <4 x double> %tmp1.i, <4 x double>* %0, align 32 + store <8 x float> %tmp1.i17, <8 x float>* %1, align 32 + store <4 x i64> %tmp1.i16, <4 x i64>* %i, 
align 32
+  ret void
+}
+
+declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
+
+define void @storev16i16(<16 x i16> %a) nounwind {
+; CHECK_SSE-LABEL: storev16i16:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movaps %xmm1, (%rax)
+; CHECK_SSE-NEXT: movaps %xmm0, (%rax)
+;
+; CHECK-LABEL: storev16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rax)
+  store <16 x i16> %a, <16 x i16>* undef, align 32
+  unreachable
+}
+
+define void @storev16i16_01(<16 x i16> %a) nounwind {
+; CHECK_SSE-LABEL: storev16i16_01:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movups %xmm1, (%rax)
+; CHECK_SSE-NEXT: movups %xmm0, (%rax)
+;
+; CHECK-LABEL: storev16i16_01:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+  store <16 x i16> %a, <16 x i16>* undef, align 4
+  unreachable
+}
+
+define void @storev32i8(<32 x i8> %a) nounwind {
+; CHECK_SSE-LABEL: storev32i8:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movaps %xmm1, (%rax)
+; CHECK_SSE-NEXT: movaps %xmm0, (%rax)
+;
+; CHECK-LABEL: storev32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rax)
+  store <32 x i8> %a, <32 x i8>* undef, align 32
+  unreachable
+}
+
+define void @storev32i8_01(<32 x i8> %a) nounwind {
+; CHECK_SSE-LABEL: storev32i8_01:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movups %xmm1, (%rax)
+; CHECK_SSE-NEXT: movups %xmm0, (%rax)
+;
+; CHECK-LABEL: storev32i8_01:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+  store <32 x i8> %a, <32 x i8>* undef, align 4
+  unreachable
+}
+
+; It is faster to do two 16-byte stores if the data is already in xmm
+; registers, for example after an integer operation.
+define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
+; CHECK_SSE-LABEL: double_save:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi)
+; CHECK_SSE-NEXT: movaps %xmm0, (%rdi)
+; CHECK_SSE-NEXT: retq
+;
+; CHECK-LABEL: double_save:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %Z, <8 x i32>* %P, align 16
+  ret void
+}
+
+define void @double_save_volatile(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind {
+; CHECK_SSE-LABEL: double_save_volatile:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi)
+; CHECK_SSE-NEXT: movaps %xmm0, (%rdi)
+; CHECK_SSE-NEXT: retq
+;
+; CHECK-LABEL: double_save_volatile:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store volatile <8 x i32> %Z, <8 x i32>* %P, align 16
+  ret void
+}
+
+define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
+; CHECK_SSE-LABEL: add8i32:
+; CHECK_SSE: # %bb.0:
+; CHECK_SSE-NEXT: movups (%rsi), %xmm0
+; CHECK_SSE-NEXT: movups 16(%rsi), %xmm1
+; CHECK_SSE-NEXT: movups %xmm1, 16(%rdi)
+; CHECK_SSE-NEXT: movups %xmm0, (%rdi)
+; CHECK_SSE-NEXT: retq
+;
+; CHECK-LABEL: add8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups 16(%rsi), %xmm1
+; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+  %b = load <8 x i32>, <8 x i32>* %bp, align 1
+  %x = add <8 x i32> zeroinitializer, %b
+  store <8 x i32> %x, <8 x i32>* %ret, align 1
+  ret void
+}
+
+define void @add4i64a64(<4 x i64>*
%ret, <4 x i64>* %bp) nounwind { +; CHECK_SSE-LABEL: add4i64a64: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps (%rsi), %xmm0 +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK-LABEL: add4i64a64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rsi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %b = load <4 x i64>, <4 x i64>* %bp, align 64 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 64 + ret void +} + +define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { +; CHECK_SSE-LABEL: add4i64a16: +; CHECK_SSE: # %bb.0: +; CHECK_SSE-NEXT: movaps (%rsi), %xmm0 +; CHECK_SSE-NEXT: movaps 16(%rsi), %xmm1 +; CHECK_SSE-NEXT: movaps %xmm1, 16(%rdi) +; CHECK_SSE-NEXT: movaps %xmm0, (%rdi) +; CHECK_SSE-NEXT: retq +; +; CHECK-LABEL: add4i64a16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rsi), %xmm0 +; CHECK-NEXT: vmovups 16(%rsi), %xmm1 +; CHECK-NEXT: vmovups %xmm1, 16(%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) +; CHECK-NEXT: retq + %b = load <4 x i64>, <4 x i64>* %bp, align 16 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 16 + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll b/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-unaligned-load-store.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f -x86-enable-unaligned-vector-move | FileCheck %s + +define <16 x i32> @test17(i8 * %addr) { +; CHECK-LABEL: test17: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x i32>* + %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 + ret <16 x i32>%res +} + +define void @test18(i8 * %addr, <8 x i64> %data) { +; CHECK-LABEL: test18: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i64>* + store <8 x i64>%data, <8 x i64>* %vaddr, align 64 + ret void +} + +define void @test19(i8 * %addr, <16 x i32> %data) { +; CHECK-LABEL: test19: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 1 + ret void +} + +define void @test20(i8 * %addr, <16 x i32> %data) { +; CHECK-LABEL: test20: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 64 + ret void +} + +define <8 x i64> @test21(i8 * %addr) { +; CHECK-LABEL: test21: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 + ret <8 x i64>%res +} + +define void @test22(i8 * %addr, <8 x i64> %data) { +; CHECK-LABEL: test22: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i64>* + store <8 x i64>%data, <8 x i64>* %vaddr, align 1 + ret void +} + +define <8 x i64> @test23(i8 * %addr) { +; CHECK-LABEL: test23: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = 
bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 + ret <8 x i64>%res +} + +define void @test24(i8 * %addr, <8 x double> %data) { +; CHECK-LABEL: test24: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 64 + ret void +} + +define <8 x double> @test25(i8 * %addr) { +; CHECK-LABEL: test25: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>, <8 x double>* %vaddr, align 64 + ret <8 x double>%res +} + +define void @test26(i8 * %addr, <16 x float> %data) { +; CHECK-LABEL: test26: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 64 + ret void +} + +define <16 x float> @test27(i8 * %addr) { +; CHECK-LABEL: test27: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>, <16 x float>* %vaddr, align 64 + ret <16 x float>%res +} + +define void @test28(i8 * %addr, <8 x double> %data) { +; CHECK-LABEL: test28: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 1 + ret void +} + +define <8 x double> @test29(i8 * %addr) { +; CHECK-LABEL: test29: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>, <8 x double>* %vaddr, align 1 + ret <8 x double>%res +} + +define void @test30(i8 * %addr, <16 x float> %data) { +; CHECK-LABEL: test30: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 1 + ret void +} + +define <16 x float> @test31(i8 * %addr) { +; CHECK-LABEL: test31: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>, <16 x float>* %vaddr, align 1 + ret <16 x float>%res +} + +define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { +; CHECK-LABEL: test32: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { +; CHECK-LABEL: test33: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { +; CHECK-LABEL: test34: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: 
retq + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { +; CHECK-LABEL: test35: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { +; CHECK-LABEL: test36: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { +; CHECK-LABEL: test37: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { +; CHECK-LABEL: test38: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) { +; CHECK-LABEL: test39: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { +; CHECK-LABEL: test40: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old + ret <16 x float>%res +} + +define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { +; CHECK-LABEL: test41: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x float> 
%r, <16 x float> %old + ret <16 x float>%res +} + +define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) { +; CHECK-LABEL: test42: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) { +; CHECK-LABEL: test43: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>, <16 x float>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { +; CHECK-LABEL: test44: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { +; CHECK-LABEL: test45: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) { +; CHECK-LABEL: test46: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer + ret <8 x double>%res +} + +define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) { +; CHECK-LABEL: test47: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>, <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer + ret <8 x double>%res +} diff --git a/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll b/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512vl-unaligned-load-store.ll @@ -0,0 +1,378 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=avx512f -mattr=avx512vl -x86-enable-unaligned-vector-move | FileCheck %s + +define <8 x i32> @test_256_1(i8 * %addr) { +; CHECK-LABEL: test_256_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>, <8 x i32>* %vaddr, align 1 + ret <8 x i32>%res +} + +define <8 x i32> @test_256_2(i8 * %addr) { +; CHECK-LABEL: test_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>, <8 x i32>* %vaddr, align 32 + ret <8 x i32>%res +} + +define void @test_256_3(i8 * %addr, <4 x i64> %data) { +; CHECK-LABEL: test_256_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x i64>%data, <4 x i64>* %vaddr, align 32 + ret void +} + +define void @test_256_4(i8 * %addr, <8 x i32> %data) { +; CHECK-LABEL: test_256_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 1 + ret void +} + +define void @test_256_5(i8 * %addr, <8 x i32> %data) { +; CHECK-LABEL: test_256_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 32 + ret void +} + +define <4 x i64> @test_256_6(i8 * %addr) { +; CHECK-LABEL: test_256_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>, <4 x i64>* %vaddr, align 32 + ret <4 x i64>%res +} + +define void @test_256_7(i8 * %addr, <4 x i64> %data) { +; CHECK-LABEL: test_256_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x i64>%data, <4 x i64>* %vaddr, align 1 + ret void +} + +define <4 x i64> @test_256_8(i8 * %addr) { +; CHECK-LABEL: test_256_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>, <4 x i64>* %vaddr, align 1 + ret <4 x i64>%res +} + +define void @test_256_9(i8 * %addr, <4 x double> %data) { +; CHECK-LABEL: test_256_9: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 32 + ret void +} + +define <4 x double> @test_256_10(i8 * %addr) { +; CHECK-LABEL: test_256_10: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>, <4 x double>* %vaddr, align 32 + ret <4 x double>%res +} + +define void @test_256_11(i8 * %addr, <8 x float> %data) { +; CHECK-LABEL: test_256_11: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 32 + ret void +} + +define <8 x float> @test_256_12(i8 * %addr) { +; CHECK-LABEL: test_256_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>, <8 x float>* %vaddr, align 32 + ret <8 x float>%res +} 
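+
+; The tests below cover under-aligned (align 1) accesses and the masked load
+; forms. Only the mnemonic changes from aligned to unaligned (e.g. vmovdqa32
+; -> vmovdqu32); the mask operands ({%k1}, {%k1} {z}) are preserved because
+; the pass only swaps the instruction descriptor via setDesc().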
+ +define void @test_256_13(i8 * %addr, <4 x double> %data) { +; CHECK-LABEL: test_256_13: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 1 + ret void +} + +define <4 x double> @test_256_14(i8 * %addr) { +; CHECK-LABEL: test_256_14: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>, <4 x double>* %vaddr, align 1 + ret <4 x double>%res +} + +define void @test_256_15(i8 * %addr, <8 x float> %data) { +; CHECK-LABEL: test_256_15: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 1 + ret void +} + +define <8 x float> @test_256_16(i8 * %addr) { +; CHECK-LABEL: test_256_16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: retq + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>, <8 x float>* %vaddr, align 1 + ret <8 x float>%res +} + +define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { +; CHECK-LABEL: test_256_17: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>, <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { +; CHECK-LABEL: test_256_18: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>, <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) { +; CHECK-LABEL: test_256_19: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>, <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) { +; CHECK-LABEL: test_256_20: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>, <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { +; CHECK-LABEL: test_256_21: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>, <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +define <4 x i64> @test_256_22(i8 * %addr, <4 
x i64> %old, <4 x i64> %mask1) { +; CHECK-LABEL: test_256_22: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>, <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) { +; CHECK-LABEL: test_256_23: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>, <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) { +; CHECK-LABEL: test_256_24: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>, <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { +; CHECK-LABEL: test_128_17: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>, <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { +; CHECK-LABEL: test_128_18: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>, <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) { +; CHECK-LABEL: test_128_19: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>, <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) { +; CHECK-LABEL: test_128_20: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>, <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { +; CHECK-LABEL: test_128_21: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x 
i64>, <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { +; CHECK-LABEL: test_128_22: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>, <2 x i64>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) { +; CHECK-LABEL: test_128_23: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>, <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} + +define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) { +; CHECK-LABEL: test_128_24: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>, <2 x i64>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -186,6 +186,7 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup +; CHECK-NEXT: X86 unaligned vector move ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches